import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
df=pd.read_csv("C:\\Users\\91741\\Downloads\\train (2).csv")
df.shape
(1460, 81)
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)
df.head()
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | Condition2 | BldgType | HouseStyle | OverallQual | OverallCond | YearBuilt | YearRemodAdd | RoofStyle | RoofMatl | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | ExterQual | ExterCond | Foundation | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinSF1 | BsmtFinType2 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | Heating | HeatingQC | CentralAir | Electrical | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | KitchenQual | TotRmsAbvGrd | Functional | Fireplaces | FireplaceQu | GarageType | GarageYrBlt | GarageFinish | GarageCars | GarageArea | GarageQual | GarageCond | PavedDrive | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | 1Fam | 2Story | 7 | 5 | 2003 | 2003 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 196.0 | Gd | TA | PConc | Gd | TA | No | GLQ | 706 | Unf | 0 | 150 | 856 | GasA | Ex | Y | SBrkr | 856 | 854 | 0 | 1710 | 1 | 0 | 2 | 1 | 3 | 1 | Gd | 8 | Typ | 0 | NaN | Attchd | 2003.0 | RFn | 2 | 548 | TA | TA | Y | 0 | 61 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
| 1 | 2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | FR2 | Gtl | Veenker | Feedr | Norm | 1Fam | 1Story | 6 | 8 | 1976 | 1976 | Gable | CompShg | MetalSd | MetalSd | None | 0.0 | TA | TA | CBlock | Gd | TA | Gd | ALQ | 978 | Unf | 0 | 284 | 1262 | GasA | Ex | Y | SBrkr | 1262 | 0 | 0 | 1262 | 0 | 1 | 2 | 0 | 3 | 1 | TA | 6 | Typ | 1 | TA | Attchd | 1976.0 | RFn | 2 | 460 | TA | TA | Y | 298 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
| 2 | 3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | 1Fam | 2Story | 7 | 5 | 2001 | 2002 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 162.0 | Gd | TA | PConc | Gd | TA | Mn | GLQ | 486 | Unf | 0 | 434 | 920 | GasA | Ex | Y | SBrkr | 920 | 866 | 0 | 1786 | 1 | 0 | 2 | 1 | 3 | 1 | Gd | 6 | Typ | 1 | TA | Attchd | 2001.0 | RFn | 2 | 608 | TA | TA | Y | 0 | 42 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
| 3 | 4 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | Corner | Gtl | Crawfor | Norm | Norm | 1Fam | 2Story | 7 | 5 | 1915 | 1970 | Gable | CompShg | Wd Sdng | Wd Shng | None | 0.0 | TA | TA | BrkTil | TA | Gd | No | ALQ | 216 | Unf | 0 | 540 | 756 | GasA | Gd | Y | SBrkr | 961 | 756 | 0 | 1717 | 1 | 0 | 1 | 0 | 3 | 1 | Gd | 7 | Typ | 1 | Gd | Detchd | 1998.0 | Unf | 3 | 642 | TA | TA | Y | 0 | 35 | 272 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
| 4 | 5 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | FR2 | Gtl | NoRidge | Norm | Norm | 1Fam | 2Story | 8 | 5 | 2000 | 2000 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 350.0 | Gd | TA | PConc | Gd | TA | Av | GLQ | 655 | Unf | 0 | 490 | 1145 | GasA | Ex | Y | SBrkr | 1145 | 1053 | 0 | 2198 | 1 | 0 | 2 | 1 | 4 | 1 | Gd | 9 | Typ | 1 | TA | Attchd | 2000.0 | RFn | 3 | 836 | TA | TA | Y | 192 | 84 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1460 entries, 0 to 1459 Data columns (total 81 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Id 1460 non-null int64 1 MSSubClass 1460 non-null int64 2 MSZoning 1460 non-null object 3 LotFrontage 1201 non-null float64 4 LotArea 1460 non-null int64 5 Street 1460 non-null object 6 Alley 91 non-null object 7 LotShape 1460 non-null object 8 LandContour 1460 non-null object 9 Utilities 1460 non-null object 10 LotConfig 1460 non-null object 11 LandSlope 1460 non-null object 12 Neighborhood 1460 non-null object 13 Condition1 1460 non-null object 14 Condition2 1460 non-null object 15 BldgType 1460 non-null object 16 HouseStyle 1460 non-null object 17 OverallQual 1460 non-null int64 18 OverallCond 1460 non-null int64 19 YearBuilt 1460 non-null int64 20 YearRemodAdd 1460 non-null int64 21 RoofStyle 1460 non-null object 22 RoofMatl 1460 non-null object 23 Exterior1st 1460 non-null object 24 Exterior2nd 1460 non-null object 25 MasVnrType 1452 non-null object 26 MasVnrArea 1452 non-null float64 27 ExterQual 1460 non-null object 28 ExterCond 1460 non-null object 29 Foundation 1460 non-null object 30 BsmtQual 1423 non-null object 31 BsmtCond 1423 non-null object 32 BsmtExposure 1422 non-null object 33 BsmtFinType1 1423 non-null object 34 BsmtFinSF1 1460 non-null int64 35 BsmtFinType2 1422 non-null object 36 BsmtFinSF2 1460 non-null int64 37 BsmtUnfSF 1460 non-null int64 38 TotalBsmtSF 1460 non-null int64 39 Heating 1460 non-null object 40 HeatingQC 1460 non-null object 41 CentralAir 1460 non-null object 42 Electrical 1459 non-null object 43 1stFlrSF 1460 non-null int64 44 2ndFlrSF 1460 non-null int64 45 LowQualFinSF 1460 non-null int64 46 GrLivArea 1460 non-null int64 47 BsmtFullBath 1460 non-null int64 48 BsmtHalfBath 1460 non-null int64 49 FullBath 1460 non-null int64 50 HalfBath 1460 non-null int64 51 BedroomAbvGr 1460 non-null int64 52 KitchenAbvGr 1460 non-null int64 53 KitchenQual 1460 non-null object 54 TotRmsAbvGrd 1460 non-null int64 55 Functional 1460 non-null object 56 Fireplaces 1460 non-null int64 57 FireplaceQu 770 non-null object 58 GarageType 1379 non-null object 59 GarageYrBlt 1379 non-null float64 60 GarageFinish 1379 non-null object 61 GarageCars 1460 non-null int64 62 GarageArea 1460 non-null int64 63 GarageQual 1379 non-null object 64 GarageCond 1379 non-null object 65 PavedDrive 1460 non-null object 66 WoodDeckSF 1460 non-null int64 67 OpenPorchSF 1460 non-null int64 68 EnclosedPorch 1460 non-null int64 69 3SsnPorch 1460 non-null int64 70 ScreenPorch 1460 non-null int64 71 PoolArea 1460 non-null int64 72 PoolQC 7 non-null object 73 Fence 281 non-null object 74 MiscFeature 54 non-null object 75 MiscVal 1460 non-null int64 76 MoSold 1460 non-null int64 77 YrSold 1460 non-null int64 78 SaleType 1460 non-null object 79 SaleCondition 1460 non-null object 80 SalePrice 1460 non-null int64 dtypes: float64(3), int64(35), object(43) memory usage: 924.0+ KB
df.describe()
| Id | MSSubClass | LotFrontage | LotArea | OverallQual | OverallCond | YearBuilt | YearRemodAdd | MasVnrArea | BsmtFinSF1 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | TotRmsAbvGrd | Fireplaces | GarageYrBlt | GarageCars | GarageArea | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | MiscVal | MoSold | YrSold | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1460.000000 | 1460.000000 | 1201.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1452.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1379.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 |
| mean | 730.500000 | 56.897260 | 70.049958 | 10516.828082 | 6.099315 | 5.575342 | 1971.267808 | 1984.865753 | 103.685262 | 443.639726 | 46.549315 | 567.240411 | 1057.429452 | 1162.626712 | 346.992466 | 5.844521 | 1515.463699 | 0.425342 | 0.057534 | 1.565068 | 0.382877 | 2.866438 | 1.046575 | 6.517808 | 0.613014 | 1978.506164 | 1.767123 | 472.980137 | 94.244521 | 46.660274 | 21.954110 | 3.409589 | 15.060959 | 2.758904 | 43.489041 | 6.321918 | 2007.815753 | 180921.195890 |
| std | 421.610009 | 42.300571 | 24.284752 | 9981.264932 | 1.382997 | 1.112799 | 30.202904 | 20.645407 | 181.066207 | 456.098091 | 161.319273 | 441.866955 | 438.705324 | 386.587738 | 436.528436 | 48.623081 | 525.480383 | 0.518911 | 0.238753 | 0.550916 | 0.502885 | 0.815778 | 0.220338 | 1.625393 | 0.644666 | 24.689725 | 0.747315 | 213.804841 | 125.338794 | 66.256028 | 61.119149 | 29.317331 | 55.757415 | 40.177307 | 496.123024 | 2.703626 | 1.328095 | 79442.502883 |
| min | 1.000000 | 20.000000 | 21.000000 | 1300.000000 | 1.000000 | 1.000000 | 1872.000000 | 1950.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 334.000000 | 0.000000 | 0.000000 | 334.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.000000 | 0.000000 | 1900.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 2006.000000 | 34900.000000 |
| 25% | 365.750000 | 20.000000 | 59.000000 | 7553.500000 | 5.000000 | 5.000000 | 1954.000000 | 1967.000000 | 0.000000 | 0.000000 | 0.000000 | 223.000000 | 795.750000 | 882.000000 | 0.000000 | 0.000000 | 1129.500000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 2.000000 | 1.000000 | 5.000000 | 0.000000 | 1961.000000 | 1.000000 | 334.500000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 5.000000 | 2007.000000 | 129975.000000 |
| 50% | 730.500000 | 50.000000 | 69.000000 | 9478.500000 | 6.000000 | 5.000000 | 1973.000000 | 1994.000000 | 0.000000 | 383.500000 | 0.000000 | 477.500000 | 991.500000 | 1087.000000 | 0.000000 | 0.000000 | 1464.000000 | 0.000000 | 0.000000 | 2.000000 | 0.000000 | 3.000000 | 1.000000 | 6.000000 | 1.000000 | 1980.000000 | 2.000000 | 480.000000 | 0.000000 | 25.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 6.000000 | 2008.000000 | 163000.000000 |
| 75% | 1095.250000 | 70.000000 | 80.000000 | 11601.500000 | 7.000000 | 6.000000 | 2000.000000 | 2004.000000 | 166.000000 | 712.250000 | 0.000000 | 808.000000 | 1298.250000 | 1391.250000 | 728.000000 | 0.000000 | 1776.750000 | 1.000000 | 0.000000 | 2.000000 | 1.000000 | 3.000000 | 1.000000 | 7.000000 | 1.000000 | 2002.000000 | 2.000000 | 576.000000 | 168.000000 | 68.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 8.000000 | 2009.000000 | 214000.000000 |
| max | 1460.000000 | 190.000000 | 313.000000 | 215245.000000 | 10.000000 | 9.000000 | 2010.000000 | 2010.000000 | 1600.000000 | 5644.000000 | 1474.000000 | 2336.000000 | 6110.000000 | 4692.000000 | 2065.000000 | 572.000000 | 5642.000000 | 3.000000 | 2.000000 | 3.000000 | 2.000000 | 8.000000 | 3.000000 | 14.000000 | 3.000000 | 2010.000000 | 4.000000 | 1418.000000 | 857.000000 | 547.000000 | 552.000000 | 508.000000 | 480.000000 | 738.000000 | 15500.000000 | 12.000000 | 2010.000000 | 755000.000000 |
Exploring Missing Values
features_nan=[feature for feature in df.columns if df[feature].isnull().sum()>0]
len(features_nan)
19
for feature in features_nan:
print(feature,':',np.round(df[feature].isnull().mean()*100,2),'% missing values')
LotFrontage : 17.74 % missing values Alley : 93.77 % missing values MasVnrType : 0.55 % missing values MasVnrArea : 0.55 % missing values BsmtQual : 2.53 % missing values BsmtCond : 2.53 % missing values BsmtExposure : 2.6 % missing values BsmtFinType1 : 2.53 % missing values BsmtFinType2 : 2.6 % missing values Electrical : 0.07 % missing values FireplaceQu : 47.26 % missing values GarageType : 5.55 % missing values GarageYrBlt : 5.55 % missing values GarageFinish : 5.55 % missing values GarageQual : 5.55 % missing values GarageCond : 5.55 % missing values PoolQC : 99.52 % missing values Fence : 80.75 % missing values MiscFeature : 96.3 % missing values
sns.heatmap(df.isnull(),yticklabels=False,cmap='coolwarm')
<Axes: >
Effect Of missing Values on Dependent Features
for feature in features_nan:
data=df.copy()
data[feature]=np.where(data[feature].isnull(),1,0)
data.groupby(feature)['SalePrice'].median().plot.bar()
plt.xlabel(feature)
plt.ylabel('SalePrice')
plt.title(feature)
plt.show()
Missing Values are affecting our dependent features so its importanyt to handle these
num_feat=[feature for feature in df.columns if df[feature].dtypes!='object']
len(num_feat)
38
Missing Values in Numerical Features
num_nan=[feature for feature in num_feat if df[feature].isnull().sum()>0]
len(num_nan)
3
for feature in num_nan:
print(feature,':',np.round(df[feature].isnull().mean()*100,2),'% missing values')
LotFrontage : 17.74 % missing values MasVnrArea : 0.55 % missing values GarageYrBlt : 5.55 % missing values
year_feat=[feature for feature in num_feat if 'Year' in feature or 'Yr'in feature]
len(year_feat)
4
Visualising Year Features
data=df.copy()
data.groupby('YrSold')['SalePrice'].median().plot()
plt.xlabel('YrSold')
plt.ylabel('SalePrice')
plt.title(feature)
Text(0.5, 1.0, 'GarageYrBlt')
As Year Passby the SalePrice is decreasing which is wrong
Exploring More on Year Features
for feature in year_feat:
data=df.copy()
if feature!='YrSold':
data[feature]=data['YrSold']-data[feature]
plt.scatter(data[feature],data['SalePrice'])
plt.xlabel(feature)
plt.ylabel('SalePrice')
plt.show()
## Above is True Picture of SalePrice with year so we use this for our year_feat modification
Looking Types OF Numerical Features:
1.Discrete Features
2.Continous Features
Disc_feat=[feature for feature in num_feat if len(df[feature].unique())<25 and feature not in year_feat+['Id']]
cont_feat=[feature for feature in num_feat if feature not in Disc_feat + year_feat+['Id']]
len(Disc_feat)
17
len(cont_feat)
16
Lets See The Distribution Of these Disc_feat with SalePrice(Dependent Feature)
for feature in Disc_feat:
data=df.copy()
data.groupby(feature)['SalePrice'].median().plot.bar()
plt.xlabel(feature)
plt.ylabel('SalePrice')
plt.title(feature)
plt.show()
Lets See The Distribution Of these cont_feat with SalePrice(Dependent Feature)
for feature in cont_feat:
data=df.copy()
data[feature].hist(bins=20)
plt.xlabel(feature)
plt.ylabel('Count')
plt.show()
for feature in cont_feat:
sns.boxplot(df[feature])
plt.title(feature)
print(feature,':',df[feature].skew())
plt.show()
LotFrontage : 2.163569142324884
LotArea : 12.207687851233496
MasVnrArea : 2.669084210182863
BsmtFinSF1 : 1.685503071910789
BsmtFinSF2 : 4.255261108933303
BsmtUnfSF : 0.9202684528039037
TotalBsmtSF : 1.5242545490627664
1stFlrSF : 1.3767566220336365
2ndFlrSF : 0.8130298163023265
GrLivArea : 1.3665603560164552
GarageArea : 0.17998090674623907
WoodDeckSF : 1.5413757571931312
OpenPorchSF : 2.3643417403694404
EnclosedPorch : 3.08987190371177
ScreenPorch : 4.122213743143115
SalePrice : 1.8828757597682129
Outliers removed on basis of deistribution whether its normal or skewed
Outliers are not removed for dependent features
cont_feat
len(cont_feat)
16
featuress=['LotFrontage','LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF',
'2ndFlrSF','GrLivArea','WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 'ScreenPorch','GarageArea']
Capping Method for removing Outliers
s=[]
for feature in featuress:
q1=df[feature].quantile(0.25)
q3=df[feature].quantile(0.75)
iqr=q3-q1
upper_bridge=q3+1.5*iqr
lower_bridge=q1-1.5*iqr
l=upper_bridge,lower_bridge
print(feature,':',df[feature].skew())
s.append(l)
print(s)
LotFrontage : 2.163569142324884 LotArea : 12.207687851233496 MasVnrArea : 2.669084210182863 BsmtFinSF1 : 1.685503071910789 BsmtFinSF2 : 4.255261108933303 BsmtUnfSF : 0.9202684528039037 TotalBsmtSF : 1.5242545490627664 1stFlrSF : 1.3767566220336365 2ndFlrSF : 0.8130298163023265 GrLivArea : 1.3665603560164552 WoodDeckSF : 1.5413757571931312 OpenPorchSF : 2.3643417403694404 EnclosedPorch : 3.08987190371177 ScreenPorch : 4.122213743143115 GarageArea : 0.17998090674623907 [(111.5, 27.5), (17673.5, 1481.5), (415.0, -249.0), (1780.625, -1068.375), (0.0, 0.0), (1685.5, -654.5), (2052.0, 42.0), (2155.125, 118.125), (1820.0, -1092.0), (2747.625, 158.625), (420.0, -252.0), (170.0, -102.0), (0.0, 0.0), (0.0, 0.0), (938.25, -27.75)]
s=[(111.5, 27.5), (17673.5, 1481.5), (415.0, -249.0), (1780.625, -1068.375), (0.0, 0.0), (1685.5, -654.5), (2052.0, 42.0), (2155.125, 118.125), (1820.0, -1092.0), (2747.625, 158.625), (420.0, -252.0), (170.0, -102.0), (0.0, 0.0), (0.0, 0.0), (938.25, -27.75)]
def impute(date, featuress, s):
for idx, feature in enumerate(featuress):
upper_bridge, lower_bridge = s[idx]
date[feature] = np.where(df[feature] > upper_bridge, upper_bridge,
np.where(df[feature] < lower_bridge, lower_bridge,
df[feature]))
for feat in featuress:
sns.boxplot(df[feat])
plt.title(feat)
plt.show()
df.head()
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | Condition2 | BldgType | HouseStyle | OverallQual | OverallCond | YearBuilt | YearRemodAdd | RoofStyle | RoofMatl | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | ExterQual | ExterCond | Foundation | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinSF1 | BsmtFinType2 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | Heating | HeatingQC | CentralAir | Electrical | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | KitchenQual | TotRmsAbvGrd | Functional | Fireplaces | FireplaceQu | GarageType | GarageYrBlt | GarageFinish | GarageCars | GarageArea | GarageQual | GarageCond | PavedDrive | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 60 | RL | 65.0 | 8450.0 | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | 1Fam | 2Story | 7 | 5 | 2003 | 2003 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 196.0 | Gd | TA | PConc | Gd | TA | No | GLQ | 706.0 | Unf | 0.0 | 150.0 | 856.0 | GasA | Ex | Y | SBrkr | 856.0 | 854.0 | 0 | 1710.0 | 1 | 0 | 2 | 1 | 3 | 1 | Gd | 8 | Typ | 0 | NaN | Attchd | 2003.0 | RFn | 2 | 548.0 | TA | TA | Y | 0.0 | 61.0 | 0.0 | 0 | 0.0 | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
| 1 | 2 | 20 | RL | 80.0 | 9600.0 | Pave | NaN | Reg | Lvl | AllPub | FR2 | Gtl | Veenker | Feedr | Norm | 1Fam | 1Story | 6 | 8 | 1976 | 1976 | Gable | CompShg | MetalSd | MetalSd | None | 0.0 | TA | TA | CBlock | Gd | TA | Gd | ALQ | 978.0 | Unf | 0.0 | 284.0 | 1262.0 | GasA | Ex | Y | SBrkr | 1262.0 | 0.0 | 0 | 1262.0 | 0 | 1 | 2 | 0 | 3 | 1 | TA | 6 | Typ | 1 | TA | Attchd | 1976.0 | RFn | 2 | 460.0 | TA | TA | Y | 298.0 | 0.0 | 0.0 | 0 | 0.0 | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
| 2 | 3 | 60 | RL | 68.0 | 11250.0 | Pave | NaN | IR1 | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | 1Fam | 2Story | 7 | 5 | 2001 | 2002 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 162.0 | Gd | TA | PConc | Gd | TA | Mn | GLQ | 486.0 | Unf | 0.0 | 434.0 | 920.0 | GasA | Ex | Y | SBrkr | 920.0 | 866.0 | 0 | 1786.0 | 1 | 0 | 2 | 1 | 3 | 1 | Gd | 6 | Typ | 1 | TA | Attchd | 2001.0 | RFn | 2 | 608.0 | TA | TA | Y | 0.0 | 42.0 | 0.0 | 0 | 0.0 | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
| 3 | 4 | 70 | RL | 60.0 | 9550.0 | Pave | NaN | IR1 | Lvl | AllPub | Corner | Gtl | Crawfor | Norm | Norm | 1Fam | 2Story | 7 | 5 | 1915 | 1970 | Gable | CompShg | Wd Sdng | Wd Shng | None | 0.0 | TA | TA | BrkTil | TA | Gd | No | ALQ | 216.0 | Unf | 0.0 | 540.0 | 756.0 | GasA | Gd | Y | SBrkr | 961.0 | 756.0 | 0 | 1717.0 | 1 | 0 | 1 | 0 | 3 | 1 | Gd | 7 | Typ | 1 | Gd | Detchd | 1998.0 | Unf | 3 | 642.0 | TA | TA | Y | 0.0 | 35.0 | 0.0 | 0 | 0.0 | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
| 4 | 5 | 60 | RL | 84.0 | 14260.0 | Pave | NaN | IR1 | Lvl | AllPub | FR2 | Gtl | NoRidge | Norm | Norm | 1Fam | 2Story | 8 | 5 | 2000 | 2000 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 350.0 | Gd | TA | PConc | Gd | TA | Av | GLQ | 655.0 | Unf | 0.0 | 490.0 | 1145.0 | GasA | Ex | Y | SBrkr | 1145.0 | 1053.0 | 0 | 2198.0 | 1 | 0 | 2 | 1 | 4 | 1 | Gd | 9 | Typ | 1 | TA | Attchd | 2000.0 | RFn | 3 | 836.0 | TA | TA | Y | 192.0 | 84.0 | 0.0 | 0 | 0.0 | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |
cat_var=[feature for feature in df.columns if df[feature].dtypes=='object']
len(cat_var)
43
cat_nan=[feature for feature in cat_var if df[feature].isnull().sum()>0]
len(cat_nan)
16
for feature in cat_nan:
print(feature,':',np.round(df[feature].isnull().mean()*100,2),'% missing values')
Alley : 93.77 % missing values MasVnrType : 0.55 % missing values BsmtQual : 2.53 % missing values BsmtCond : 2.53 % missing values BsmtExposure : 2.6 % missing values BsmtFinType1 : 2.53 % missing values BsmtFinType2 : 2.6 % missing values Electrical : 0.07 % missing values FireplaceQu : 47.26 % missing values GarageType : 5.55 % missing values GarageFinish : 5.55 % missing values GarageQual : 5.55 % missing values GarageCond : 5.55 % missing values PoolQC : 99.52 % missing values Fence : 80.75 % missing values MiscFeature : 96.3 % missing values
for feature in cat_var:
data=df.copy()
data.groupby(feature)['SalePrice'].median().plot.bar()
plt.title(feature)
plt.xlabel(feature)
plt.ylabel('Median Saleprice')
plt.show()
## From above we can see how different features are affecting saleprice
We apply train test split before feature engineering in order to avoid data leakage
from sklearn.model_selection import train_test_split
train,test = train_test_split(df,test_size=0.2,random_state=42)
train.shape, test.shape
((1168, 81), (292, 81))
1.Categorical Features
cat_nan_features=[feature for feature in train.columns if train[feature].dtypes=='object' and train[feature].isnull().sum()>0]
len(cat_nan_features)
16
for feature in cat_nan_features:
print(feature,':',np.round(train[feature].isnull().mean()*100,2),'% missing values')
Alley : 93.66 % missing values MasVnrType : 0.51 % missing values BsmtQual : 2.4 % missing values BsmtCond : 2.4 % missing values BsmtExposure : 2.4 % missing values BsmtFinType1 : 2.4 % missing values BsmtFinType2 : 2.4 % missing values Electrical : 0.09 % missing values FireplaceQu : 46.83 % missing values GarageType : 5.48 % missing values GarageFinish : 5.48 % missing values GarageQual : 5.48 % missing values GarageCond : 5.48 % missing values PoolQC : 99.49 % missing values Fence : 80.05 % missing values MiscFeature : 96.06 % missing values
def replace_nan(train,cat_nan_features):
dff=train.copy()
dff[cat_nan_features]=dff[cat_nan_features].fillna('Missing')
return dff
train=replace_nan(train,cat_nan_features)
train[cat_nan_features].isnull().sum()
Alley 0 MasVnrType 0 BsmtQual 0 BsmtCond 0 BsmtExposure 0 BsmtFinType1 0 BsmtFinType2 0 Electrical 0 FireplaceQu 0 GarageType 0 GarageFinish 0 GarageQual 0 GarageCond 0 PoolQC 0 Fence 0 MiscFeature 0 dtype: int64
train.shape
(1168, 81)
num_nan_features = [feature for feature in train.columns if train[feature].dtypes!='O' and train[feature].isnull().sum()>0]
len(num_nan_features)
3
for feature in num_nan_features:
print(feature,':',np.round(df[feature].isnull().mean()*100,2),'% missing values')
LotFrontage : 17.74 % missing values MasVnrArea : 0.55 % missing values GarageYrBlt : 5.55 % missing values
for feature in num_nan_features:
median_value=train[feature].median()
train[feature+"_nan"] = np.where(train[feature].isnull(),1,0)
train[feature].fillna(median_value,inplace=True)
train[num_nan_features].isnull().sum()
LotFrontage 0 MasVnrArea 0 GarageYrBlt 0 dtype: int64
train.shape
(1168, 84)
Handling Temporal Variables
year_feat
['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold']
features=['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']
for feature in year_feat:
train[feature]=train['YrSold']-train[feature]
train[['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']].head()
| YearBuilt | YearRemodAdd | GarageYrBlt | |
|---|---|---|---|
| 254 | 53 | 53 | 53.0 |
| 1066 | 16 | 15 | 16.0 |
| 638 | 98 | 58 | 28.0 |
| 799 | 70 | 57 | 68.0 |
| 380 | 86 | 60 | 86.0 |
train.head()
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | Condition2 | BldgType | HouseStyle | OverallQual | OverallCond | YearBuilt | YearRemodAdd | RoofStyle | RoofMatl | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | ExterQual | ExterCond | Foundation | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinSF1 | BsmtFinType2 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | Heating | HeatingQC | CentralAir | Electrical | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | KitchenQual | TotRmsAbvGrd | Functional | Fireplaces | FireplaceQu | GarageType | GarageYrBlt | GarageFinish | GarageCars | GarageArea | GarageQual | GarageCond | PavedDrive | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | LotFrontage_nan | MasVnrArea_nan | GarageYrBlt_nan | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 254 | 255 | 20 | RL | 70.0 | 8400.0 | Pave | Missing | Reg | Lvl | AllPub | Inside | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 5 | 6 | 53 | 53 | Gable | CompShg | MetalSd | MetalSd | None | 0.0 | TA | Gd | CBlock | TA | TA | No | Rec | 922.0 | Unf | 0.0 | 392.0 | 1314.0 | GasA | TA | Y | SBrkr | 1314.0 | 0.0 | 0 | 1314.0 | 1 | 0 | 1 | 0 | 3 | 1 | TA | 5 | Typ | 0 | Missing | Attchd | 53.0 | RFn | 1 | 294.0 | TA | TA | Y | 250.0 | 0.0 | 0.0 | 0 | 0.0 | 0 | Missing | Missing | Missing | 0 | 6 | 0 | WD | Normal | 145000 | 0 | 0 | 0 |
| 1066 | 1067 | 60 | RL | 59.0 | 7837.0 | Pave | Missing | IR1 | Lvl | AllPub | Inside | Gtl | Gilbert | Norm | Norm | 1Fam | 2Story | 6 | 7 | 16 | 15 | Gable | CompShg | VinylSd | VinylSd | None | 0.0 | Gd | TA | PConc | Gd | TA | No | Unf | 0.0 | Unf | 0.0 | 799.0 | 799.0 | GasA | Gd | Y | SBrkr | 799.0 | 772.0 | 0 | 1571.0 | 0 | 0 | 2 | 1 | 3 | 1 | TA | 7 | Typ | 1 | TA | Attchd | 16.0 | RFn | 2 | 380.0 | TA | TA | Y | 0.0 | 40.0 | 0.0 | 0 | 0.0 | 0 | Missing | Missing | Missing | 0 | 5 | 0 | WD | Normal | 178000 | 0 | 0 | 0 |
| 638 | 639 | 30 | RL | 67.0 | 8777.0 | Pave | Missing | Reg | Lvl | AllPub | Inside | Gtl | Edwards | Feedr | Norm | 1Fam | 1Story | 5 | 7 | 98 | 58 | Gable | CompShg | MetalSd | Wd Sdng | None | 0.0 | TA | TA | CBlock | Fa | TA | No | Unf | 0.0 | Unf | 0.0 | 796.0 | 796.0 | GasA | Gd | Y | FuseA | 796.0 | 0.0 | 0 | 796.0 | 0 | 0 | 1 | 0 | 2 | 1 | TA | 4 | Typ | 0 | Missing | Missing | 28.0 | Missing | 0 | 0.0 | Missing | Missing | P | 328.0 | 0.0 | 0.0 | 0 | 0.0 | 0 | Missing | MnPrv | Missing | 0 | 5 | 0 | WD | Normal | 85000 | 0 | 0 | 1 |
| 799 | 800 | 50 | RL | 60.0 | 7200.0 | Pave | Missing | Reg | Lvl | AllPub | Corner | Gtl | SWISU | Feedr | Norm | 1Fam | 1.5Fin | 5 | 7 | 70 | 57 | Gable | CompShg | Wd Sdng | Wd Sdng | BrkFace | 252.0 | TA | TA | BrkTil | Gd | TA | No | ALQ | 569.0 | Unf | 0.0 | 162.0 | 731.0 | GasA | Ex | Y | SBrkr | 981.0 | 787.0 | 0 | 1768.0 | 1 | 0 | 1 | 1 | 3 | 1 | Gd | 7 | Typ | 2 | TA | Detchd | 68.0 | Unf | 1 | 240.0 | TA | TA | Y | 0.0 | 0.0 | 0.0 | 0 | 0.0 | 0 | Missing | MnPrv | Missing | 0 | 6 | 0 | WD | Normal | 175000 | 0 | 0 | 0 |
| 380 | 381 | 50 | RL | 50.0 | 5000.0 | Pave | Pave | Reg | Lvl | AllPub | Inside | Gtl | SWISU | Norm | Norm | 1Fam | 1.5Fin | 5 | 6 | 86 | 60 | Gable | CompShg | BrkFace | Wd Sdng | None | 0.0 | TA | TA | BrkTil | TA | TA | No | LwQ | 218.0 | Unf | 0.0 | 808.0 | 1026.0 | GasA | TA | Y | SBrkr | 1026.0 | 665.0 | 0 | 1691.0 | 0 | 0 | 2 | 0 | 3 | 1 | Gd | 6 | Typ | 1 | Gd | Detchd | 86.0 | Unf | 1 | 308.0 | TA | TA | Y | 0.0 | 0.0 | 0.0 | 0 | 0.0 | 0 | Missing | Missing | Missing | 0 | 5 | 0 | WD | Normal | 127000 | 0 | 0 | 0 |
train.columns
Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',
'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
'SaleCondition', 'SalePrice', 'LotFrontage_nan', 'MasVnrArea_nan',
'GarageYrBlt_nan'],
dtype='object')
for feature in train.columns:
plt.hist(train[feature],bins=20)
plt.title(feature)
plt.show()
Disc_feat=[feature for feature in num_feat if len(train[feature].unique())<25 and feature not in year_feat+['Id']]
cont_feat=[feature for feature in num_feat if feature not in Disc_feat + year_feat+['Id']]
x=[feature for feature in train.columns if train[feature].dtypes!='object'and feature in cont_feat]
x
['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'SalePrice']
for feature in x:
print(feature,':',train[feature].skew())
LotFrontage : 0.07906515289033489 LotArea : 0.19342857588029116 MasVnrArea : 1.2748890788415927 BsmtFinSF1 : 0.7230258028627954 BsmtUnfSF : 0.7904304940072501 TotalBsmtSF : 0.23478699338365772 1stFlrSF : 0.6113893091750534 2ndFlrSF : 0.7878996002666335 GrLivArea : 0.5796735095315892 GarageArea : -0.11152945885964144 WoodDeckSF : 1.0903417142244038 OpenPorchSF : 1.0545071966412067 SalePrice : 1.743128561420854
skew_feature=['MasVnrArea','BsmtFinSF1','BsmtUnfSF','1stFlrSF','2ndFlrSF','GrLivArea','WoodDeckSF','OpenPorchSF','SalePrice']
import scipy.stats as stat
for feature in skew_feature:
train[feature],parameters=stat.boxcox(train[feature]+1)
print(feature,':',train[feature].skew())
MasVnrArea : 0.37499775630254123 BsmtFinSF1 : -0.42497082737217173 BsmtUnfSF : -0.28761928691335925 1stFlrSF : -0.004253922621941451 2ndFlrSF : 0.25972021556529684 GrLivArea : -0.007010546332412592 WoodDeckSF : 0.15517682491755982 OpenPorchSF : -0.06588271709629863 SalePrice : -0.008051284584600405
train.head()
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | Condition2 | BldgType | HouseStyle | OverallQual | OverallCond | YearBuilt | YearRemodAdd | RoofStyle | RoofMatl | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | ExterQual | ExterCond | Foundation | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinSF1 | BsmtFinType2 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | Heating | HeatingQC | CentralAir | Electrical | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | KitchenQual | TotRmsAbvGrd | Functional | Fireplaces | FireplaceQu | GarageType | GarageYrBlt | GarageFinish | GarageCars | GarageArea | GarageQual | GarageCond | PavedDrive | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | LotFrontage_nan | MasVnrArea_nan | GarageYrBlt_nan | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 254 | 255 | 20 | RL | 70.0 | 8400.0 | Pave | Missing | Reg | Lvl | AllPub | Inside | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 5 | 6 | 53 | 53 | Gable | CompShg | MetalSd | MetalSd | None | 0.000000 | TA | Gd | CBlock | TA | TA | No | Rec | 16.819902 | Unf | 0.0 | 33.897101 | 1314.0 | GasA | TA | Y | SBrkr | 12.228361 | 0.000000 | 0 | 17.622338 | 1 | 0 | 1 | 0 | 3 | 1 | TA | 5 | Typ | 0 | Missing | Attchd | 53.0 | RFn | 1 | 294.0 | TA | TA | Y | 4.338126 | 0.000000 | 0.0 | 0 | 0.0 | 0 | Missing | Missing | Missing | 0 | 6 | 0 | WD | Normal | 7.530705 | 0 | 0 | 0 |
| 1066 | 1067 | 60 | RL | 59.0 | 7837.0 | Pave | Missing | IR1 | Lvl | AllPub | Inside | Gtl | Gilbert | Norm | Norm | 1Fam | 2Story | 6 | 7 | 16 | 15 | Gable | CompShg | VinylSd | VinylSd | None | 0.000000 | Gd | TA | PConc | Gd | TA | No | Unf | 0.000000 | Unf | 0.0 | 48.370017 | 799.0 | GasA | Gd | Y | SBrkr | 10.942670 | 4.610315 | 0 | 18.514443 | 0 | 0 | 2 | 1 | 3 | 1 | TA | 7 | Typ | 1 | TA | Attchd | 16.0 | RFn | 2 | 380.0 | TA | TA | Y | 0.000000 | 4.090227 | 0.0 | 0 | 0.0 | 0 | Missing | Missing | Missing | 0 | 5 | 0 | WD | Normal | 7.605930 | 0 | 0 | 0 |
| 638 | 639 | 30 | RL | 67.0 | 8777.0 | Pave | Missing | Reg | Lvl | AllPub | Inside | Gtl | Edwards | Feedr | Norm | 1Fam | 1Story | 5 | 7 | 98 | 58 | Gable | CompShg | MetalSd | Wd Sdng | None | 0.000000 | TA | TA | CBlock | Fa | TA | No | Unf | 0.000000 | Unf | 0.0 | 48.279947 | 796.0 | GasA | Gd | Y | FuseA | 10.933281 | 0.000000 | 0 | 15.300059 | 0 | 0 | 1 | 0 | 2 | 1 | TA | 4 | Typ | 0 | Missing | Missing | 28.0 | Missing | 0 | 0.0 | Missing | Missing | P | 4.499425 | 0.000000 | 0.0 | 0 | 0.0 | 0 | Missing | MnPrv | Missing | 0 | 5 | 0 | WD | Normal | 7.328603 | 0 | 0 | 1 |
| 799 | 800 | 50 | RL | 60.0 | 7200.0 | Pave | Missing | Reg | Lvl | AllPub | Corner | Gtl | SWISU | Feedr | Norm | 1Fam | 1.5Fin | 5 | 7 | 70 | 57 | Gable | CompShg | Wd Sdng | Wd Sdng | BrkFace | 3.212329 | TA | TA | BrkTil | Gd | TA | No | ALQ | 14.572489 | Unf | 0.0 | 21.588988 | 731.0 | GasA | Ex | Y | SBrkr | 11.462364 | 4.619083 | 0 | 19.124129 | 1 | 0 | 1 | 1 | 3 | 1 | Gd | 7 | Typ | 2 | TA | Detchd | 68.0 | Unf | 1 | 240.0 | TA | TA | Y | 0.000000 | 0.000000 | 0.0 | 0 | 0.0 | 0 | Missing | MnPrv | Missing | 0 | 6 | 0 | WD | Normal | 7.599743 | 0 | 0 | 0 |
| 380 | 381 | 50 | RL | 50.0 | 5000.0 | Pave | Pave | Reg | Lvl | AllPub | Inside | Gtl | SWISU | Norm | Norm | 1Fam | 1.5Fin | 5 | 6 | 86 | 60 | Gable | CompShg | BrkFace | Wd Sdng | None | 0.000000 | TA | TA | BrkTil | TA | TA | No | LwQ | 10.796435 | Unf | 0.0 | 48.639168 | 1026.0 | GasA | TA | Y | SBrkr | 11.577920 | 4.541664 | 0 | 18.892444 | 0 | 0 | 2 | 0 | 3 | 1 | Gd | 6 | Typ | 1 | Gd | Detchd | 86.0 | Unf | 1 | 308.0 | TA | TA | Y | 0.000000 | 0.000000 | 0.0 | 0 | 0.0 | 0 | Missing | Missing | Missing | 0 | 5 | 0 | WD | Normal | 7.481387 | 0 | 0 | 0 |
for feature in skew_feature:
sns.histplot(train[feature])
print(feature,':',train[feature].skew())
plt.show()
MasVnrArea : 0.37499775630254123
BsmtFinSF1 : -0.42497082737217173
BsmtUnfSF : -0.28761928691335925
1stFlrSF : -0.004253922621941451
2ndFlrSF : 0.25972021556529684
GrLivArea : -0.007010546332412592
WoodDeckSF : 0.15517682491755982
OpenPorchSF : -0.06588271709629863
SalePrice : -0.008051284584600405
If the no of a particular category is less than 1% of total observation then we will assign a new category for it
for feature in cat_var:
temp=train.groupby(feature)['SalePrice'].count()/len(train)
temp_df=temp[temp>0.01].index
train[feature] = np.where( train[feature].isin(temp_df), train[feature], 'Rare_var' )
all_labels=[]
for feature in cat_var:
labels_ordered=train.groupby(feature)['SalePrice'].mean().sort_values().index
labels_ordered={k:i for i,k in enumerate(labels_ordered,0)}
all_labels.append(labels_ordered)
train[feature]=train[feature].map(labels_ordered)
print(all_labels)
[{'Rare_var': 0, 'RM': 1, 'RH': 2, 'RL': 3, 'FV': 4}, {'Rare_var': 0, 'Pave': 1}, {'Grvl': 0, 'Pave': 1, 'Missing': 2}, {'Reg': 0, 'IR1': 1, 'Rare_var': 2, 'IR2': 3}, {'Bnk': 0, 'Lvl': 1, 'Low': 2, 'HLS': 3}, {'Rare_var': 0, 'AllPub': 1}, {'Corner': 0, 'Inside': 1, 'FR2': 2, 'Rare_var': 3, 'CulDSac': 4}, {'Gtl': 0, 'Mod': 1, 'Rare_var': 2}, {'BrDale': 0, 'IDOTRR': 1, 'BrkSide': 2, 'Edwards': 3, 'OldTown': 4, 'Sawyer': 5, 'SWISU': 6, 'NAmes': 7, 'Rare_var': 8, 'Mitchel': 9, 'SawyerW': 10, 'NWAmes': 11, 'Blmngtn': 12, 'Gilbert': 13, 'CollgCr': 14, 'Crawfor': 15, 'ClearCr': 16, 'Somerst': 17, 'Timber': 18, 'StoneBr': 19, 'NridgHt': 20, 'NoRidge': 21}, {'Artery': 0, 'Feedr': 1, 'Norm': 2, 'Rare_var': 3, 'RRAn': 4, 'PosN': 5}, {'Rare_var': 0, 'Norm': 1}, {'2fmCon': 0, 'Twnhs': 1, 'Duplex': 2, '1Fam': 3, 'TwnhsE': 4}, {'1.5Unf': 0, '1.5Fin': 1, 'SFoyer': 2, '1Story': 3, 'SLvl': 4, 'Rare_var': 5, '2Story': 6}, {'Gable': 0, 'Rare_var': 1, 'Hip': 2}, {'CompShg': 0, 'Rare_var': 1}, {'AsbShng': 0, 'Rare_var': 1, 'WdShing': 2, 'Wd Sdng': 3, 'MetalSd': 4, 'HdBoard': 5, 'Stucco': 6, 'Plywood': 7, 'BrkFace': 8, 'VinylSd': 9, 'CemntBd': 10}, {'AsbShng': 0, 'Wd Sdng': 1, 'MetalSd': 2, 'Wd Shng': 3, 'Stucco': 4, 'Rare_var': 5, 'HdBoard': 6, 'Plywood': 7, 'VinylSd': 8, 'BrkFace': 9, 'CmentBd': 10}, {'BrkCmn': 0, 'None': 1, 'BrkFace': 2, 'Rare_var': 3, 'Stone': 4}, {'Rare_var': 0, 'TA': 1, 'Gd': 2, 'Ex': 3}, {'Fa': 0, 'Rare_var': 1, 'Gd': 2, 'TA': 3}, {'Slab': 0, 'BrkTil': 1, 'CBlock': 2, 'Rare_var': 3, 'PConc': 4}, {'Missing': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4}, {'Rare_var': 0, 'Missing': 1, 'Fa': 2, 'TA': 3, 'Gd': 4}, {'Missing': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4}, {'Missing': 0, 'Rec': 1, 'LwQ': 2, 'BLQ': 3, 'ALQ': 4, 'Unf': 5, 'GLQ': 6}, {'Missing': 0, 'BLQ': 1, 'Rec': 2, 'LwQ': 3, 'Unf': 4, 'Rare_var': 5, 'ALQ': 6}, {'Rare_var': 0, 'GasW': 1, 'GasA': 2}, {'Rare_var': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4}, {'N': 0, 'Y': 1}, {'FuseF': 0, 'Rare_var': 1, 'FuseA': 2, 'SBrkr': 3}, {'Fa': 0, 'TA': 1, 'Gd': 2, 'Ex': 3}, {'Rare_var': 0, 'Mod': 1, 'Min2': 2, 'Min1': 3, 'Typ': 4}, {'Po': 0, 'Missing': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}, {'Missing': 0, 'Rare_var': 1, 'Detchd': 2, 'Basment': 3, 'Attchd': 4, 'BuiltIn': 5}, {'Missing': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3}, {'Missing': 0, 'Fa': 1, 'Rare_var': 2, 'TA': 3, 'Gd': 4}, {'Missing': 0, 'Fa': 1, 'Rare_var': 2, 'TA': 3}, {'N': 0, 'P': 1, 'Y': 2}, {'Missing': 0, 'Rare_var': 1}, {'GdWo': 0, 'Rare_var': 1, 'MnPrv': 2, 'Missing': 3, 'GdPrv': 4}, {'Rare_var': 0, 'Shed': 1, 'Missing': 2}, {'COD': 0, 'Rare_var': 1, 'WD': 2, 'New': 3}, {'Abnorml': 0, 'Family': 1, 'Rare_var': 2, 'Normal': 3, 'Partial': 4}]
train.head()
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | Condition2 | BldgType | HouseStyle | OverallQual | OverallCond | YearBuilt | YearRemodAdd | RoofStyle | RoofMatl | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | ExterQual | ExterCond | Foundation | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinSF1 | BsmtFinType2 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | Heating | HeatingQC | CentralAir | Electrical | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | KitchenQual | TotRmsAbvGrd | Functional | Fireplaces | FireplaceQu | GarageType | GarageYrBlt | GarageFinish | GarageCars | GarageArea | GarageQual | GarageCond | PavedDrive | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | LotFrontage_nan | MasVnrArea_nan | GarageYrBlt_nan | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 254 | 255 | 20 | 3 | 70.0 | 8400.0 | 1 | 2 | 0 | 1 | 1 | 1 | 0 | 7 | 2 | 1 | 3 | 3 | 5 | 6 | 53 | 53 | 0 | 0 | 4 | 2 | 1 | 0.000000 | 1 | 2 | 2 | 2 | 3 | 1 | 1 | 16.819902 | 4 | 0.0 | 33.897101 | 1314.0 | 2 | 2 | 1 | 3 | 12.228361 | 0.000000 | 0 | 17.622338 | 1 | 0 | 1 | 0 | 3 | 1 | 1 | 5 | 4 | 0 | 1 | 4 | 53.0 | 2 | 1 | 294.0 | 3 | 3 | 2 | 4.338126 | 0.000000 | 0.0 | 0 | 0.0 | 0 | 0 | 3 | 2 | 0 | 6 | 0 | 2 | 3 | 7.530705 | 0 | 0 | 0 |
| 1066 | 1067 | 60 | 3 | 59.0 | 7837.0 | 1 | 2 | 1 | 1 | 1 | 1 | 0 | 13 | 2 | 1 | 3 | 6 | 6 | 7 | 16 | 15 | 0 | 0 | 9 | 8 | 1 | 0.000000 | 2 | 3 | 4 | 3 | 3 | 1 | 5 | 0.000000 | 4 | 0.0 | 48.370017 | 799.0 | 2 | 3 | 1 | 3 | 10.942670 | 4.610315 | 0 | 18.514443 | 0 | 0 | 2 | 1 | 3 | 1 | 1 | 7 | 4 | 1 | 3 | 4 | 16.0 | 2 | 2 | 380.0 | 3 | 3 | 2 | 0.000000 | 4.090227 | 0.0 | 0 | 0.0 | 0 | 0 | 3 | 2 | 0 | 5 | 0 | 2 | 3 | 7.605930 | 0 | 0 | 0 |
| 638 | 639 | 30 | 3 | 67.0 | 8777.0 | 1 | 2 | 0 | 1 | 1 | 1 | 0 | 3 | 1 | 1 | 3 | 3 | 5 | 7 | 98 | 58 | 0 | 0 | 4 | 1 | 1 | 0.000000 | 1 | 3 | 2 | 1 | 3 | 1 | 5 | 0.000000 | 4 | 0.0 | 48.279947 | 796.0 | 2 | 3 | 1 | 2 | 10.933281 | 0.000000 | 0 | 15.300059 | 0 | 0 | 1 | 0 | 2 | 1 | 1 | 4 | 4 | 0 | 1 | 0 | 28.0 | 0 | 0 | 0.0 | 0 | 0 | 1 | 4.499425 | 0.000000 | 0.0 | 0 | 0.0 | 0 | 0 | 2 | 2 | 0 | 5 | 0 | 2 | 3 | 7.328603 | 0 | 0 | 1 |
| 799 | 800 | 50 | 3 | 60.0 | 7200.0 | 1 | 2 | 0 | 1 | 1 | 0 | 0 | 6 | 1 | 1 | 3 | 1 | 5 | 7 | 70 | 57 | 0 | 0 | 3 | 1 | 2 | 3.212329 | 1 | 3 | 1 | 3 | 3 | 1 | 4 | 14.572489 | 4 | 0.0 | 21.588988 | 731.0 | 2 | 4 | 1 | 3 | 11.462364 | 4.619083 | 0 | 19.124129 | 1 | 0 | 1 | 1 | 3 | 1 | 2 | 7 | 4 | 2 | 3 | 2 | 68.0 | 1 | 1 | 240.0 | 3 | 3 | 2 | 0.000000 | 0.000000 | 0.0 | 0 | 0.0 | 0 | 0 | 2 | 2 | 0 | 6 | 0 | 2 | 3 | 7.599743 | 0 | 0 | 0 |
| 380 | 381 | 50 | 3 | 50.0 | 5000.0 | 1 | 1 | 0 | 1 | 1 | 1 | 0 | 6 | 2 | 1 | 3 | 1 | 5 | 6 | 86 | 60 | 0 | 0 | 8 | 1 | 1 | 0.000000 | 1 | 3 | 1 | 2 | 3 | 1 | 2 | 10.796435 | 4 | 0.0 | 48.639168 | 1026.0 | 2 | 2 | 1 | 3 | 11.577920 | 4.541664 | 0 | 18.892444 | 0 | 0 | 2 | 0 | 3 | 1 | 2 | 6 | 4 | 1 | 4 | 2 | 86.0 | 1 | 1 | 308.0 | 3 | 3 | 2 | 0.000000 | 0.000000 | 0.0 | 0 | 0.0 | 0 | 0 | 3 | 2 | 0 | 5 | 0 | 2 | 3 | 7.481387 | 0 | 0 | 0 |
from sklearn.preprocessing import MinMaxScaler
scaled_feat = [feature for feature in train.columns if feature not in ['Id','SalePrice'] ]
s=MinMaxScaler()
s.fit(train[scaled_feat])
traine_data=pd.concat([train[['Id','SalePrice']].reset_index(drop=True),pd.DataFrame(s.transform(train[scaled_feat]),columns=scaled_feat)],axis=1)
traine_data.head()
| Id | SalePrice | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | Condition2 | BldgType | HouseStyle | OverallQual | OverallCond | YearBuilt | YearRemodAdd | RoofStyle | RoofMatl | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | ExterQual | ExterCond | Foundation | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinSF1 | BsmtFinType2 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | Heating | HeatingQC | CentralAir | Electrical | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | KitchenQual | TotRmsAbvGrd | Functional | Fireplaces | FireplaceQu | GarageType | GarageYrBlt | GarageFinish | GarageCars | GarageArea | GarageQual | GarageCond | PavedDrive | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | LotFrontage_nan | MasVnrArea_nan | GarageYrBlt_nan | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 255 | 7.530705 | 0.000000 | 0.75 | 0.505952 | 0.427279 | 1.0 | 1.0 | 0.000000 | 0.333333 | 1.0 | 0.25 | 0.0 | 0.333333 | 0.4 | 1.0 | 0.75 | 0.500000 | 0.444444 | 0.625 | 0.389706 | 0.885246 | 0.0 | 0.0 | 0.4 | 0.2 | 0.25 | 0.000000 | 0.333333 | 0.666667 | 0.50 | 0.50 | 0.75 | 0.25 | 0.166667 | 0.827560 | 0.666667 | 0.0 | 0.485295 | 0.632836 | 1.0 | 0.50 | 1.0 | 1.000000 | 0.709071 | 0.000000 | 0.0 | 0.595706 | 0.333333 | 0.0 | 0.333333 | 0.0 | 0.375 | 0.333333 | 0.333333 | 0.250000 | 1.0 | 0.000000 | 0.2 | 0.8 | 0.495327 | 0.666667 | 0.25 | 0.313349 | 0.75 | 1.0 | 1.0 | 0.934344 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.75 | 1.0 | 0.0 | 0.454545 | 0.0 | 0.666667 | 0.75 | 0.0 | 0.0 | 0.0 |
| 1 | 1067 | 7.605930 | 0.235294 | 0.75 | 0.375000 | 0.392509 | 1.0 | 1.0 | 0.333333 | 0.333333 | 1.0 | 0.25 | 0.0 | 0.619048 | 0.4 | 1.0 | 0.75 | 1.000000 | 0.555556 | 0.750 | 0.117647 | 0.262295 | 0.0 | 0.0 | 0.9 | 0.8 | 0.25 | 0.000000 | 0.666667 | 1.000000 | 1.00 | 0.75 | 0.75 | 0.25 | 0.833333 | 0.000000 | 0.666667 | 0.0 | 0.692499 | 0.376617 | 1.0 | 0.75 | 1.0 | 1.000000 | 0.435877 | 0.925289 | 0.0 | 0.687633 | 0.000000 | 0.0 | 0.666667 | 0.5 | 0.375 | 0.333333 | 0.333333 | 0.416667 | 1.0 | 0.333333 | 0.6 | 0.8 | 0.149533 | 0.666667 | 0.50 | 0.405009 | 0.75 | 1.0 | 1.0 | 0.000000 | 0.695354 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.75 | 1.0 | 0.0 | 0.363636 | 0.0 | 0.666667 | 0.75 | 0.0 | 0.0 | 0.0 |
| 2 | 639 | 7.328603 | 0.058824 | 0.75 | 0.470238 | 0.450562 | 1.0 | 1.0 | 0.000000 | 0.333333 | 1.0 | 0.25 | 0.0 | 0.142857 | 0.2 | 1.0 | 0.75 | 0.500000 | 0.444444 | 0.750 | 0.720588 | 0.967213 | 0.0 | 0.0 | 0.4 | 0.1 | 0.25 | 0.000000 | 0.333333 | 1.000000 | 0.50 | 0.25 | 0.75 | 0.25 | 0.833333 | 0.000000 | 0.666667 | 0.0 | 0.691209 | 0.375124 | 1.0 | 0.75 | 1.0 | 0.666667 | 0.433882 | 0.000000 | 0.0 | 0.356408 | 0.000000 | 0.0 | 0.333333 | 0.0 | 0.250 | 0.333333 | 0.333333 | 0.166667 | 1.0 | 0.000000 | 0.2 | 0.0 | 0.261682 | 0.000000 | 0.00 | 0.000000 | 0.00 | 0.0 | 0.5 | 0.969084 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.50 | 1.0 | 0.0 | 0.363636 | 0.0 | 0.666667 | 0.75 | 0.0 | 0.0 | 1.0 |
| 3 | 800 | 7.599743 | 0.176471 | 0.75 | 0.386905 | 0.353168 | 1.0 | 1.0 | 0.000000 | 0.333333 | 1.0 | 0.00 | 0.0 | 0.285714 | 0.2 | 1.0 | 0.75 | 0.166667 | 0.444444 | 0.750 | 0.514706 | 0.950820 | 0.0 | 0.0 | 0.3 | 0.1 | 0.50 | 0.957995 | 0.333333 | 1.000000 | 0.25 | 0.75 | 0.75 | 0.25 | 0.666667 | 0.716985 | 0.666667 | 0.0 | 0.309083 | 0.342786 | 1.0 | 1.00 | 1.0 | 1.000000 | 0.546306 | 0.927048 | 0.0 | 0.750458 | 0.333333 | 0.0 | 0.333333 | 0.5 | 0.375 | 0.333333 | 0.666667 | 0.416667 | 1.0 | 0.666667 | 0.6 | 0.4 | 0.635514 | 0.333333 | 0.25 | 0.255795 | 0.75 | 1.0 | 1.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.50 | 1.0 | 0.0 | 0.454545 | 0.0 | 0.666667 | 0.75 | 0.0 | 0.0 | 0.0 |
| 4 | 381 | 7.481387 | 0.176471 | 0.75 | 0.267857 | 0.217299 | 1.0 | 0.5 | 0.000000 | 0.333333 | 1.0 | 0.25 | 0.0 | 0.285714 | 0.4 | 1.0 | 0.75 | 0.166667 | 0.444444 | 0.625 | 0.632353 | 1.000000 | 0.0 | 0.0 | 0.8 | 0.1 | 0.25 | 0.000000 | 0.333333 | 1.000000 | 0.25 | 0.50 | 0.75 | 0.25 | 0.333333 | 0.531198 | 0.666667 | 0.0 | 0.696352 | 0.489552 | 1.0 | 0.50 | 1.0 | 1.000000 | 0.570860 | 0.911511 | 0.0 | 0.726584 | 0.000000 | 0.0 | 0.666667 | 0.0 | 0.375 | 0.333333 | 0.666667 | 0.333333 | 1.0 | 0.333333 | 0.8 | 0.4 | 0.803738 | 0.333333 | 0.25 | 0.328271 | 0.75 | 1.0 | 1.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.75 | 1.0 | 0.0 | 0.363636 | 0.0 | 0.666667 | 0.75 | 0.0 | 0.0 | 0.0 |
x_train = traine_data.drop(['Id','SalePrice'], axis=1)
y_train = traine_data['SalePrice']
x_train.shape , y_train.shape
((1168, 82), (1168,))
traine_data.to_csv('traine_.csv', index=False)
cat_nan_feat=[feature for feature in test.columns if test[feature].dtypes=='object'and test[feature].isnull().sum()>0]
len(cat_nan_feat)
15
def replace_nan(test,cat_nan_feat):
for feature in cat_nan_feat:
test[feature].fillna('Missing',inplace=True)
replace_nan(test,cat_nan_feat)
test[cat_nan_feat].isnull().sum()
Alley 0 MasVnrType 0 BsmtQual 0 BsmtCond 0 BsmtExposure 0 BsmtFinType1 0 BsmtFinType2 0 FireplaceQu 0 GarageType 0 GarageFinish 0 GarageQual 0 GarageCond 0 PoolQC 0 Fence 0 MiscFeature 0 dtype: int64
for feature in cat_var:
temp=test.groupby(feature)['SalePrice'].count()/len(test)
temp_df=temp[temp>0.01]
test[feature]=np.where(test[feature].isin(temp_df),test[feature],'rare_var')
for feature in cat_var:
labels_ordered=test.groupby(feature)['SalePrice'].mean().sort_values().index
labels_ordered={k:i for i,k in enumerate(labels_ordered,0)}
test[feature]=test[feature].map(labels_ordered)
test.head()
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | Condition2 | BldgType | HouseStyle | OverallQual | OverallCond | YearBuilt | YearRemodAdd | RoofStyle | RoofMatl | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | ExterQual | ExterCond | Foundation | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinSF1 | BsmtFinType2 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | Heating | HeatingQC | CentralAir | Electrical | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | KitchenQual | TotRmsAbvGrd | Functional | Fireplaces | FireplaceQu | GarageType | GarageYrBlt | GarageFinish | GarageCars | GarageArea | GarageQual | GarageCond | PavedDrive | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 892 | 893 | 20 | 0 | 70.0 | 8414.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 8 | 1963 | 2003 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 663.0 | 0 | 0.0 | 396.0 | 1059.0 | 0 | 0 | 0 | 0 | 1068.0 | 0.0 | 0 | 1068.0 | 0 | 1 | 1 | 0 | 3 | 1 | 0 | 6 | 0 | 0 | 0 | 0 | 1963.0 | 0 | 1 | 264.0 | 0 | 0 | 0 | 192.0 | 0.0 | 0.0 | 0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 2 | 2006 | 0 | 0 | 154500 |
| 1105 | 1106 | 60 | 0 | 98.0 | 12256.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 5 | 1994 | 1995 | 0 | 0 | 0 | 0 | 0 | 362.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1032.0 | 0 | 0.0 | 431.0 | 1463.0 | 0 | 0 | 0 | 0 | 1500.0 | 1122.0 | 0 | 2622.0 | 1 | 0 | 2 | 1 | 3 | 1 | 0 | 9 | 0 | 2 | 0 | 0 | 1994.0 | 0 | 2 | 712.0 | 0 | 0 | 0 | 186.0 | 32.0 | 0.0 | 0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 4 | 2010 | 0 | 0 | 325000 |
| 413 | 414 | 30 | 0 | 56.0 | 8960.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 6 | 1927 | 1950 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0.0 | 1008.0 | 1008.0 | 0 | 0 | 0 | 0 | 1028.0 | 0.0 | 0 | 1028.0 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 5 | 0 | 1 | 0 | 0 | 1927.0 | 0 | 2 | 360.0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 3 | 2010 | 0 | 0 | 115000 |
| 522 | 523 | 50 | 0 | 50.0 | 5000.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 7 | 1947 | 1950 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 399.0 | 0 | 0.0 | 605.0 | 1004.0 | 0 | 0 | 0 | 0 | 1004.0 | 660.0 | 0 | 1664.0 | 0 | 0 | 2 | 0 | 3 | 1 | 0 | 7 | 0 | 2 | 0 | 0 | 1950.0 | 0 | 2 | 420.0 | 0 | 0 | 0 | 0.0 | 24.0 | 0.0 | 0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 10 | 2006 | 0 | 0 | 159000 |
| 1036 | 1037 | 20 | 0 | 89.0 | 12898.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | 5 | 2007 | 2008 | 0 | 0 | 0 | 0 | 0 | 70.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1022.0 | 0 | 0.0 | 598.0 | 1620.0 | 0 | 0 | 0 | 0 | 1620.0 | 0.0 | 0 | 1620.0 | 1 | 0 | 2 | 0 | 2 | 1 | 0 | 6 | 0 | 1 | 0 | 0 | 2008.0 | 0 | 3 | 912.0 | 0 | 0 | 0 | 228.0 | 0.0 | 0.0 | 0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 9 | 2009 | 0 | 0 | 315500 |
num_nan_features = [feature for feature in test.columns if test[feature].dtypes!='O' and test[feature].isnull().sum()>0]
for feature in num_nan_features:
median=test[feature].median()
test[feature+"_nan"] = np.where(test[feature].isnull(),1,0)
test[feature].fillna(median,inplace=True)
test[num_nan_features].isnull().sum()
LotFrontage 0 MasVnrArea 0 GarageYrBlt 0 dtype: int64
year_feat
['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold']
features=['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']
for feature in features:
test[feature]=test['YrSold']-test[feature]
test.columns
Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',
'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
'SaleCondition', 'SalePrice', 'LotFrontage_nan', 'MasVnrArea_nan',
'GarageYrBlt_nan'],
dtype='object')
for feature in test.columns:
plt.hist(test[feature],bins=20)
plt.title(feature)
plt.show()
Disc_feat=[feature for feature in num_feat if len(test[feature].unique())<25 and feature not in year_feat+['Id']]
cont_feat=[feature for feature in num_feat if feature not in Disc_feat + year_feat+['Id']]
x=[feature for feature in test.columns if test[feature].dtypes!='object'and feature in cont_feat]
x
['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'SalePrice']
for feature in x:
print(feature,':',test[feature].skew())
LotFrontage : 0.13525007860287586 LotArea : 0.278935292144115 MasVnrArea : 1.407458483609809 BsmtFinSF1 : 0.8102608510060361 BsmtUnfSF : 0.8187601521572084 TotalBsmtSF : 0.2527861304487117 1stFlrSF : 0.8841106021878509 2ndFlrSF : 0.850190191444724 GrLivArea : 0.6744274686688129 GarageArea : 0.0945189419839681 WoodDeckSF : 0.9945802382684821 OpenPorchSF : 1.509618543232546 SalePrice : 2.2643948196066
skew_features=['MasVnrArea','BsmtFinSF1','BsmtUnfSF','1stFlrSF','2ndFlrSF','GrLivArea','WoodDeckSF','OpenPorchSF','SalePrice']
import scipy.stats as stat
for feature in skew_features:
test[feature],parameters=stat.boxcox(test[feature]+1)
print(feature,':',test[feature].skew())
MasVnrArea : 0.6309412532135422 BsmtFinSF1 : -0.3982084098229691 BsmtUnfSF : -0.32507601106345896 1stFlrSF : 0.008558223691437283 2ndFlrSF : 0.3679919122602979 GrLivArea : -0.006237786167460394 WoodDeckSF : 0.031061833934702614 OpenPorchSF : 0.08411681166435096 SalePrice : -0.012004687957725873
for feature in skew_features:
sns.histplot(test[feature])
print(feature,':',test[feature].skew())
plt.show()
MasVnrArea : 0.6309412532135422
BsmtFinSF1 : -0.3982084098229691
BsmtUnfSF : -0.32507601106345896
1stFlrSF : 0.008558223691437283
2ndFlrSF : 0.3679919122602979
GrLivArea : -0.006237786167460394
WoodDeckSF : 0.031061833934702614
OpenPorchSF : 0.08411681166435096
SalePrice : -0.012004687957725873
test.head()
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | Condition2 | BldgType | HouseStyle | OverallQual | OverallCond | YearBuilt | YearRemodAdd | RoofStyle | RoofMatl | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | ExterQual | ExterCond | Foundation | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinSF1 | BsmtFinType2 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | Heating | HeatingQC | CentralAir | Electrical | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | KitchenQual | TotRmsAbvGrd | Functional | Fireplaces | FireplaceQu | GarageType | GarageYrBlt | GarageFinish | GarageCars | GarageArea | GarageQual | GarageCond | PavedDrive | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | LotFrontage_nan | MasVnrArea_nan | GarageYrBlt_nan | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 892 | 893 | 20 | 0 | 70.0 | 8414.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 8 | 43 | 3 | 0 | 0 | 0 | 0 | 0 | 0.000000 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 14.550354 | 0 | 0.0 | 36.932292 | 1059.0 | 0 | 0 | 0 | 0 | 3.456999 | 0.000000 | 0 | 13.101794 | 0 | 1 | 1 | 0 | 3 | 1 | 0 | 6 | 0 | 0 | 0 | 0 | 43.0 | 0 | 1 | 264.0 | 0 | 0 | 0 | 5.007722 | 0.000000 | 0.0 | 0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 2 | 2006 | 0 | 0 | 7.699558 | 0 | 0 | 0 |
| 1105 | 1106 | 60 | 0 | 98.0 | 12256.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 5 | 16 | 15 | 0 | 0 | 0 | 0 | 0 | 2.454517 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 16.513742 | 0 | 0.0 | 38.593592 | 1463.0 | 0 | 0 | 0 | 0 | 3.521783 | 4.140187 | 0 | 16.162483 | 1 | 0 | 2 | 1 | 3 | 1 | 0 | 9 | 0 | 2 | 0 | 0 | 16.0 | 0 | 2 | 712.0 | 0 | 0 | 0 | 4.979142 | 3.123739 | 0.0 | 0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 4 | 2010 | 0 | 0 | 7.977640 | 0 | 0 | 0 |
| 413 | 414 | 30 | 0 | 56.0 | 8960.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 6 | 83 | 60 | 0 | 0 | 0 | 0 | 0 | 0.000000 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.000000 | 0 | 0.0 | 59.750559 | 1008.0 | 0 | 0 | 0 | 0 | 3.449396 | 0.000000 | 0 | 12.981532 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 5 | 0 | 1 | 0 | 0 | 83.0 | 0 | 2 | 360.0 | 0 | 0 | 0 | 0.000000 | 0.000000 | 0.0 | 0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 3 | 2010 | 0 | 0 | 7.584483 | 0 | 0 | 0 |
| 522 | 523 | 50 | 0 | 50.0 | 5000.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 7 | 59 | 56 | 0 | 0 | 0 | 0 | 0 | 0.000000 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12.523429 | 0 | 0.0 | 45.989935 | 1004.0 | 0 | 0 | 0 | 0 | 3.444658 | 3.968194 | 0 | 14.556064 | 0 | 0 | 2 | 0 | 3 | 1 | 0 | 7 | 0 | 2 | 0 | 0 | 56.0 | 0 | 2 | 420.0 | 0 | 0 | 0 | 0.000000 | 2.901097 | 0.0 | 0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 10 | 2006 | 0 | 0 | 7.710604 | 0 | 0 | 0 |
| 1036 | 1037 | 20 | 0 | 89.0 | 12898.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | 5 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 2.185909 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 16.468421 | 0 | 0.0 | 45.714995 | 1620.0 | 0 | 0 | 0 | 0 | 3.535768 | 0.000000 | 0 | 14.465111 | 1 | 0 | 2 | 0 | 2 | 1 | 0 | 6 | 0 | 1 | 0 | 0 | 1.0 | 0 | 3 | 912.0 | 0 | 0 | 0 | 5.162202 | 0.000000 | 0.0 | 0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 9 | 2009 | 0 | 0 | 7.966859 | 0 | 0 | 0 |
from sklearn.preprocessing import MinMaxScaler
scaled_feat = [feature for feature in test.columns if feature not in ['Id','SalePrice'] ]
s=MinMaxScaler()
s.fit(test[scaled_feat])
test_data=pd.concat([test[['Id','SalePrice']].reset_index(drop=True),pd.DataFrame(s.transform(test[scaled_feat]),columns=scaled_feat)],axis=1)
test_data.head()
| Id | SalePrice | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | Condition2 | BldgType | HouseStyle | OverallQual | OverallCond | YearBuilt | YearRemodAdd | RoofStyle | RoofMatl | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | ExterQual | ExterCond | Foundation | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinSF1 | BsmtFinType2 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | Heating | HeatingQC | CentralAir | Electrical | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | KitchenQual | TotRmsAbvGrd | Functional | Fireplaces | FireplaceQu | GarageType | GarageYrBlt | GarageFinish | GarageCars | GarageArea | GarageQual | GarageCond | PavedDrive | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | LotFrontage_nan | MasVnrArea_nan | GarageYrBlt_nan | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 893 | 7.699558 | 0.000000 | 0.0 | 0.505952 | 0.427808 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.500 | 0.857143 | 0.338583 | 0.050000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.757199 | 0.0 | 0.0 | 0.475890 | 0.505970 | 0.0 | 0.0 | 0.0 | 0.0 | 0.575363 | 0.000000 | 0.0 | 0.422665 | 0.0 | 0.5 | 0.333333 | 0.0 | 0.500000 | 0.0 | 0.0 | 0.333333 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.488636 | 0.0 | 0.25 | 0.281375 | 0.0 | 0.0 | 0.0 | 0.877298 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.090909 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 1106 | 7.977640 | 0.235294 | 0.0 | 0.839286 | 0.665225 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.750 | 0.428571 | 0.125984 | 0.250000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.993476 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.859374 | 0.0 | 0.0 | 0.497296 | 0.706965 | 0.0 | 0.0 | 0.0 | 0.0 | 0.789418 | 0.966352 | 0.0 | 0.969231 | 0.5 | 0.0 | 0.666667 | 0.5 | 0.500000 | 0.0 | 0.0 | 0.666667 | 0.0 | 0.666667 | 0.0 | 0.0 | 0.181818 | 0.0 | 0.50 | 0.758860 | 0.0 | 0.0 | 0.0 | 0.872291 | 0.715990 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.272727 | 1.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 414 | 7.584483 | 0.058824 | 0.0 | 0.339286 | 0.461548 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.375 | 0.571429 | 0.653543 | 1.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.769914 | 0.480597 | 0.0 | 0.0 | 0.0 | 0.0 | 0.550242 | 0.000000 | 0.0 | 0.401189 | 0.0 | 0.0 | 0.333333 | 0.0 | 0.333333 | 0.0 | 0.0 | 0.222222 | 0.0 | 0.333333 | 0.0 | 0.0 | 0.943182 | 0.0 | 0.50 | 0.383693 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.181818 | 1.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 523 | 7.710604 | 0.176471 | 0.0 | 0.267857 | 0.216839 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.500 | 0.714286 | 0.464567 | 0.933333 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.651718 | 0.0 | 0.0 | 0.592602 | 0.478607 | 0.0 | 0.0 | 0.0 | 0.0 | 0.534585 | 0.926207 | 0.0 | 0.682363 | 0.0 | 0.0 | 0.666667 | 0.0 | 0.500000 | 0.0 | 0.0 | 0.444444 | 0.0 | 0.666667 | 0.0 | 0.0 | 0.636364 | 0.0 | 0.50 | 0.447642 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.664958 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.818182 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 4 | 1037 | 7.966859 | 0.000000 | 0.0 | 0.732143 | 0.704897 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.875 | 0.428571 | 0.015748 | 0.016667 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.884755 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.857015 | 0.0 | 0.0 | 0.589059 | 0.785075 | 0.0 | 0.0 | 0.0 | 0.0 | 0.835627 | 0.000000 | 0.0 | 0.666121 | 0.5 | 0.0 | 0.666667 | 0.0 | 0.333333 | 0.0 | 0.0 | 0.333333 | 0.0 | 0.333333 | 0.0 | 0.0 | 0.011364 | 0.0 | 0.75 | 0.972022 | 0.0 | 0.0 | 0.0 | 0.904361 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.727273 | 0.75 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
x_test = test_data.drop(['Id','SalePrice'], axis=1)
y_test = test_data['SalePrice']
x_test.shape , y_test.shape
((292, 82), (292,))
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
# SelectFromModel : selects features from model whose coefficient is non-zero
model=SelectFromModel(Lasso(alpha=0.005,random_state=0))
# Always remember the seed value(random_state value); you have to use same value for test dataset
## More the alpha less features required
model.fit(x_train,y_train)
SelectFromModel(estimator=Lasso(alpha=0.005, random_state=0))In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
SelectFromModel(estimator=Lasso(alpha=0.005, random_state=0))
Lasso(alpha=0.005, random_state=0)
Lasso(alpha=0.005, random_state=0)
model.get_support()
array([False, False, False, True, False, False, False, False, False,
False, False, True, False, False, False, False, True, False,
False, True, False, False, False, False, False, True, False,
False, False, False, False, False, False, True, False, False,
False, True, False, False, False, False, False, False, False,
True, False, False, False, False, False, False, True, False,
False, False, True, True, False, True, False, True, False,
False, False, True, True, False, False, False, False, False,
False, False, False, False, False, False, False, False, False,
False])
selected_feat=x_train.columns[(model.get_support())]
selected_feat
Index(['LotArea', 'Neighborhood', 'OverallQual', 'YearRemodAdd', 'MasVnrArea',
'BsmtFinSF1', 'TotalBsmtSF', 'GrLivArea', 'KitchenQual', 'FireplaceQu',
'GarageType', 'GarageFinish', 'GarageArea', 'WoodDeckSF',
'OpenPorchSF'],
dtype='object')
print('Total features: {}'.format(x_train.shape[1]))
print('selected features: {}'.format(len(selected_feat)))
print("features with coeffecient zero: {}".format(np.sum(model.estimator_.coef_ == 0)))
Total features: 82 selected features: 15 features with coeffecient zero: 67
train_x = x_train[selected_feat]
train_y = traine_data['SalePrice']
test_x = x_test[selected_feat]
test_y = test_data['SalePrice']
train_x.shape , train_y.shape
test_x.shape,test_y.shape
((292, 15), (292,))
plt.figure(figsize=(10,10))
cor=train_x.corr()
sns.heatmap(cor,annot=True,cmap='coolwarm')
<Axes: >
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
l=LinearRegression()
l.fit(x_train,y_train)
y_pred=l.predict(x_test)
from sklearn.metrics import r2_score
r2_score(y_pred,test_y)
-1.8149227249625457
from sklearn.model_selection import KFold
folds=KFold(n_splits=5,shuffle=True,random_state=100)
scores=cross_val_score(l,train_x,train_y,scoring='r2',cv=5)
scores.mean()
0.872984794341404
from sklearn import metrics
linear_score = cross_val_score(l,test_x,test_y,cv=5)
print('Cross Val Score: ', linear_score.mean())
print('Mean Absolute Error:', mean_absolute_error(test_y, y_pred))
print('Mean Squared Error:',mean_squared_error(test_y, y_pred))
linear_RMSE = np.sqrt(mean_squared_error(test_y, y_pred))
print('Root Mean Squared Error:', linear_RMSE )
Cross Val Score: 0.8564504681790399 Mean Absolute Error: 69.25674559492849 Mean Squared Error: 7436.544527847517 Root Mean Squared Error: 86.2354018245843
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
kf = KFold(shuffle=True, random_state=0, n_splits=3)
estimator = Pipeline([("polynomial_features", PolynomialFeatures()),
("ridge_regression", Ridge())])
params = {
'polynomial_features__degree': [1, 2, 3],
'ridge_regression__alpha': [1e-15,1e-10,1e-8,1e-3,0.005,1e-2,0.05,0.1,0.5,1,5,10,20,30,35,40]
}
# 'alpha':[1e-15,1e-10,1e-8,1e-3,0.005,1e-2,0.05,0.1,0.5,1,5,10,20,30,35,40]
grid = GridSearchCV(estimator, params, cv=kf)
grid.fit(train_x,train_y)
print('best score: {}'.format(grid.best_score_))
print('best score: {}'.format(grid.best_params_))
C:\ProgramData\anaconda3\lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=2.03888e-18): result may not be accurate. return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T C:\ProgramData\anaconda3\lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=1.95988e-18): result may not be accurate. return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T C:\ProgramData\anaconda3\lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=1.97335e-18): result may not be accurate. return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T C:\ProgramData\anaconda3\lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=2.30314e-19): result may not be accurate. return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T C:\ProgramData\anaconda3\lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=2.21057e-19): result may not be accurate. return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T C:\ProgramData\anaconda3\lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=2.25051e-19): result may not be accurate. return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T C:\ProgramData\anaconda3\lib\site-packages\sklearn\linear_model\_ridge.py:255: UserWarning: Singular matrix in solving dual problem. Using least-squares solution instead. warnings.warn( C:\ProgramData\anaconda3\lib\site-packages\sklearn\linear_model\_ridge.py:255: UserWarning: Singular matrix in solving dual problem. Using least-squares solution instead. warnings.warn( C:\ProgramData\anaconda3\lib\site-packages\sklearn\linear_model\_ridge.py:253: LinAlgWarning: Ill-conditioned matrix (rcond=1.03569e-18): result may not be accurate. dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
best score: 0.8710007754103465
best score: {'polynomial_features__degree': 1, 'ridge_regression__alpha': 0.1}
ridge = Ridge(alpha=0.1)
ridge.fit(train_x,train_y)
y_pred = ridge.predict(test_x)
ridge_score = cross_val_score(ridge,test_x,test_y,cv=10)
print('Cross Val Score: ', ridge_score.mean())
ridge_RMSE = np.sqrt(mean_squared_error(test_y, y_pred))
print('Root Mean Squared Error:', ridge_RMSE)
Cross Val Score: 0.8606347335018991 Root Mean Squared Error: 0.27740277050635215
from sklearn.model_selection import GridSearchCV
lasso=Lasso()
parameters={'alpha':[1e-15,1e-10,1e-8,1e-3,0.005,1e-2,0.05,0.1,0.5,1,5,10,20,30,35,40]}
lasso_regressor=GridSearchCV(lasso,parameters,cv=kf)
lasso_regressor.fit(x_train,y_train)
print(lasso_regressor.best_params_)
print(lasso_regressor.best_score_)
C:\ProgramData\anaconda3\lib\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 7.409e-01, tolerance: 1.564e-03 model = cd_fast.enet_coordinate_descent( C:\ProgramData\anaconda3\lib\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.778e-02, tolerance: 1.609e-03 model = cd_fast.enet_coordinate_descent( C:\ProgramData\anaconda3\lib\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 6.655e-01, tolerance: 1.575e-03 model = cd_fast.enet_coordinate_descent( C:\ProgramData\anaconda3\lib\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 6.907e-01, tolerance: 1.564e-03 model = cd_fast.enet_coordinate_descent( C:\ProgramData\anaconda3\lib\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 3.536e-02, tolerance: 1.564e-03 model = cd_fast.enet_coordinate_descent(
{'alpha': 1e-15}
0.8913916436750625
C:\ProgramData\anaconda3\lib\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 9.678e-01, tolerance: 2.374e-03 model = cd_fast.enet_coordinate_descent(
lasso = Lasso(alpha=0.001)
lasso.fit(train_x,train_y)
y_pred = lasso.predict(test_x)
lasso_score = cross_val_score(lasso,test_x,test_y,cv=10)
print('Cross Val Score: ', lasso_score.mean())
lasso_RMSE = np.sqrt(mean_squared_error(test_y, y_pred))
print('Root Mean Squared Error:', lasso_RMSE)
Cross Val Score: 0.8607689533495189 Root Mean Squared Error: 0.2765120843288113
print('Linear Regression score: {} , RMSE: {}'.format(linear_score.mean(),linear_RMSE))
print('Ridge Regression score: {} , RMSE: {}'.format(ridge_score.mean(),ridge_RMSE))
print('Lasso Regression score: {} , RMSE: {}'.format(lasso_score.mean() ,lasso_RMSE))
Linear Regression score: 0.8564504681790399 , RMSE: 86.2354018245843 Ridge Regression score: 0.8606347335018991 , RMSE: 0.27740277050635215 Lasso Regression score: 0.8607689533495189 , RMSE: 0.2765120843288113
Lasso Regressor performs better than linear and ridge and have high CV score and less RMSE
from sklearn.svm import SVR
svm_regressor = SVR()
parameters = {'kernel':['poly','rbf'] , 'degree':[2,3,4,5] }
svm_reg = GridSearchCV(svm_regressor,parameters,cv=kf)
svm_reg.fit(train_x,train_y)
print('Best Parameters: {}'.format(svm_reg.best_params_))
print('Best SCore: {}'.format(svm_reg.best_score_))
Best Parameters: {'degree': 2, 'kernel': 'rbf'}
Best SCore: 0.8001388309664638
svm_regressor = SVR(kernel='rbf', degree=2)
svm_regressor.fit(train_x,train_y)
y_pred = svm_regressor.predict(test_x)
SVM_score = cross_val_score(svm_regressor,test_x,test_y,cv=10)
print('Cross Val Score: ', SVM_score.mean())
SVM_RMSE = np.sqrt(mean_squared_error(test_y, y_pred))
print('Root Mean Squared Error:', SVM_RMSE)
Cross Val Score: 0.7691136815622601 Root Mean Squared Error: 0.30500190639881103
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
rf = RandomForestRegressor()
#Randomized Search CV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
# max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]
# Method of selecting samples for training each tree
# bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
'max_features': max_features,
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf}
# rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid,
# n_iter = 100, cv = 5, verbose=2, random_state=42, n_jobs = 1)
# rf_random.fit(train_x,train_y)
# print(rf_random.best_params_)
# print('score: {}' .format(rf_random.best_score_))
rf = RandomForestRegressor(n_estimators=500, min_samples_split=2,
min_samples_leaf=1, max_features='sqrt',
max_depth=15)
rf.fit(train_x,train_y)
y_pred = rf.predict(test_x)
RF_score = cross_val_score(rf,test_x,test_y,cv=10)
print('Cross Val Score: ', RF_score.mean())
RF_RMSE = np.sqrt(mean_squared_error(test_y, y_pred))
print('Root Mean Squared Error:', RF_RMSE)
Cross Val Score: 0.838707183638175 Root Mean Squared Error: 0.27086556001851975
from sklearn.neighbors import KNeighborsRegressor
accuracy=[]
k=0
temp=0
for n in range(1,50):
knn=KNeighborsRegressor(n_neighbors=n)
result= cross_val_score(knn, test_x, test_y, cv=10)
accuracy.append(result.mean())
if (result.mean() > temp):
temp=result.mean()
k=n
plt.figure(figsize=(10,8))
axes = plt.axes()
axes.grid()
plt.plot(range(1,50), accuracy, color='blue', linestyle=':',
marker='o', markerfacecolor='red',markersize=10 )
plt.title('accuracy vs K-value')
plt.xlabel('K-value')
plt.xticks(range(0,50,2))
plt.ylabel('accuracy')
print('Best K-value: {}'.format(k))
Best K-value: 9
knn=KNeighborsRegressor(n_neighbors=3)
knn.fit(train_x,train_y)
y_pred = knn.predict(test_x)
knn_score = cross_val_score(knn,test_x,test_y,cv=10)
print('Cross Val Score: ', knn_score.mean())
knn_RMSE = np.sqrt(mean_squared_error(test_y, y_pred))
print('Root Mean Squared Error:', knn_RMSE)
Cross Val Score: 0.6588580491418964 Root Mean Squared Error: 0.2975340522610127
print('Linear Regression score: {} , RMSE: {}'.format(linear_score.mean(),linear_RMSE))
print('Ridge Regression score: {} , RMSE: {}'.format(ridge_score.mean(),ridge_RMSE))
print('Lasso Regression score: {} , RMSE: {}'.format(lasso_score.mean() ,lasso_RMSE))
print('KNN Regression score: {} , RMSE: {}'.format(knn_score.mean() ,knn_RMSE))
print('SVM Regression score: {} , RMSE: {}'.format(SVM_score.mean() ,SVM_RMSE))
print('Random Forest score: {} , RMSE: {}'.format(RF_score.mean() ,RF_RMSE))
Linear Regression score: 0.8564504681790399 , RMSE: 86.2354018245843 Ridge Regression score: 0.8606347335018991 , RMSE: 0.27740277050635215 Lasso Regression score: 0.8607689533495189 , RMSE: 0.2765120843288113 KNN Regression score: 0.6588580491418964 , RMSE: 0.2975340522610127 SVM Regression score: 0.7691136815622601 , RMSE: 0.30500190639881103 Random Forest score: 0.838707183638175 , RMSE: 0.27086556001851975
Lasso has performed well out of all with highest cross validation score
test_df = pd.read_csv("C:\\Users\\91741\\Downloads\\test (2).csv")
test_df.head()
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | Condition2 | BldgType | HouseStyle | OverallQual | OverallCond | YearBuilt | YearRemodAdd | RoofStyle | RoofMatl | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | ExterQual | ExterCond | Foundation | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinSF1 | BsmtFinType2 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | Heating | HeatingQC | CentralAir | Electrical | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | KitchenQual | TotRmsAbvGrd | Functional | Fireplaces | FireplaceQu | GarageType | GarageYrBlt | GarageFinish | GarageCars | GarageArea | GarageQual | GarageCond | PavedDrive | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1461 | 20 | RH | 80.0 | 11622 | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | NAmes | Feedr | Norm | 1Fam | 1Story | 5 | 6 | 1961 | 1961 | Gable | CompShg | VinylSd | VinylSd | None | 0.0 | TA | TA | CBlock | TA | TA | No | Rec | 468.0 | LwQ | 144.0 | 270.0 | 882.0 | GasA | TA | Y | SBrkr | 896 | 0 | 0 | 896 | 0.0 | 0.0 | 1 | 0 | 2 | 1 | TA | 5 | Typ | 0 | NaN | Attchd | 1961.0 | Unf | 1.0 | 730.0 | TA | TA | Y | 140 | 0 | 0 | 0 | 120 | 0 | NaN | MnPrv | NaN | 0 | 6 | 2010 | WD | Normal |
| 1 | 1462 | 20 | RL | 81.0 | 14267 | Pave | NaN | IR1 | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 6 | 6 | 1958 | 1958 | Hip | CompShg | Wd Sdng | Wd Sdng | BrkFace | 108.0 | TA | TA | CBlock | TA | TA | No | ALQ | 923.0 | Unf | 0.0 | 406.0 | 1329.0 | GasA | TA | Y | SBrkr | 1329 | 0 | 0 | 1329 | 0.0 | 0.0 | 1 | 1 | 3 | 1 | Gd | 6 | Typ | 0 | NaN | Attchd | 1958.0 | Unf | 1.0 | 312.0 | TA | TA | Y | 393 | 36 | 0 | 0 | 0 | 0 | NaN | NaN | Gar2 | 12500 | 6 | 2010 | WD | Normal |
| 2 | 1463 | 60 | RL | 74.0 | 13830 | Pave | NaN | IR1 | Lvl | AllPub | Inside | Gtl | Gilbert | Norm | Norm | 1Fam | 2Story | 5 | 5 | 1997 | 1998 | Gable | CompShg | VinylSd | VinylSd | None | 0.0 | TA | TA | PConc | Gd | TA | No | GLQ | 791.0 | Unf | 0.0 | 137.0 | 928.0 | GasA | Gd | Y | SBrkr | 928 | 701 | 0 | 1629 | 0.0 | 0.0 | 2 | 1 | 3 | 1 | TA | 6 | Typ | 1 | TA | Attchd | 1997.0 | Fin | 2.0 | 482.0 | TA | TA | Y | 212 | 34 | 0 | 0 | 0 | 0 | NaN | MnPrv | NaN | 0 | 3 | 2010 | WD | Normal |
| 3 | 1464 | 60 | RL | 78.0 | 9978 | Pave | NaN | IR1 | Lvl | AllPub | Inside | Gtl | Gilbert | Norm | Norm | 1Fam | 2Story | 6 | 6 | 1998 | 1998 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 20.0 | TA | TA | PConc | TA | TA | No | GLQ | 602.0 | Unf | 0.0 | 324.0 | 926.0 | GasA | Ex | Y | SBrkr | 926 | 678 | 0 | 1604 | 0.0 | 0.0 | 2 | 1 | 3 | 1 | Gd | 7 | Typ | 1 | Gd | Attchd | 1998.0 | Fin | 2.0 | 470.0 | TA | TA | Y | 360 | 36 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 6 | 2010 | WD | Normal |
| 4 | 1465 | 120 | RL | 43.0 | 5005 | Pave | NaN | IR1 | HLS | AllPub | Inside | Gtl | StoneBr | Norm | Norm | TwnhsE | 1Story | 8 | 5 | 1992 | 1992 | Gable | CompShg | HdBoard | HdBoard | None | 0.0 | Gd | TA | PConc | Gd | TA | No | ALQ | 263.0 | Unf | 0.0 | 1017.0 | 1280.0 | GasA | Ex | Y | SBrkr | 1280 | 0 | 0 | 1280 | 0.0 | 0.0 | 2 | 0 | 2 | 1 | Gd | 5 | Typ | 0 | NaN | Attchd | 1992.0 | RFn | 2.0 | 506.0 | TA | TA | Y | 0 | 82 | 0 | 0 | 144 | 0 | NaN | NaN | NaN | 0 | 1 | 2010 | WD | Normal |
def replace_nan(test_df,cat_nan_feat):
for feature in cat_nan_feat:
test_df[feature].fillna('Missing',inplace=True)
replace_nan(test_df,cat_nan_feat)
test_df.head()
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | Condition2 | BldgType | HouseStyle | OverallQual | OverallCond | YearBuilt | YearRemodAdd | RoofStyle | RoofMatl | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | ExterQual | ExterCond | Foundation | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinSF1 | BsmtFinType2 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | Heating | HeatingQC | CentralAir | Electrical | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | KitchenQual | TotRmsAbvGrd | Functional | Fireplaces | FireplaceQu | GarageType | GarageYrBlt | GarageFinish | GarageCars | GarageArea | GarageQual | GarageCond | PavedDrive | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1461 | 20 | RH | 80.0 | 11622 | Pave | Missing | Reg | Lvl | AllPub | Inside | Gtl | NAmes | Feedr | Norm | 1Fam | 1Story | 5 | 6 | 1961 | 1961 | Gable | CompShg | VinylSd | VinylSd | None | 0.0 | TA | TA | CBlock | TA | TA | No | Rec | 468.0 | LwQ | 144.0 | 270.0 | 882.0 | GasA | TA | Y | SBrkr | 896 | 0 | 0 | 896 | 0.0 | 0.0 | 1 | 0 | 2 | 1 | TA | 5 | Typ | 0 | Missing | Attchd | 1961.0 | Unf | 1.0 | 730.0 | TA | TA | Y | 140 | 0 | 0 | 0 | 120 | 0 | Missing | MnPrv | Missing | 0 | 6 | 2010 | WD | Normal |
| 1 | 1462 | 20 | RL | 81.0 | 14267 | Pave | Missing | IR1 | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 6 | 6 | 1958 | 1958 | Hip | CompShg | Wd Sdng | Wd Sdng | BrkFace | 108.0 | TA | TA | CBlock | TA | TA | No | ALQ | 923.0 | Unf | 0.0 | 406.0 | 1329.0 | GasA | TA | Y | SBrkr | 1329 | 0 | 0 | 1329 | 0.0 | 0.0 | 1 | 1 | 3 | 1 | Gd | 6 | Typ | 0 | Missing | Attchd | 1958.0 | Unf | 1.0 | 312.0 | TA | TA | Y | 393 | 36 | 0 | 0 | 0 | 0 | Missing | Missing | Gar2 | 12500 | 6 | 2010 | WD | Normal |
| 2 | 1463 | 60 | RL | 74.0 | 13830 | Pave | Missing | IR1 | Lvl | AllPub | Inside | Gtl | Gilbert | Norm | Norm | 1Fam | 2Story | 5 | 5 | 1997 | 1998 | Gable | CompShg | VinylSd | VinylSd | None | 0.0 | TA | TA | PConc | Gd | TA | No | GLQ | 791.0 | Unf | 0.0 | 137.0 | 928.0 | GasA | Gd | Y | SBrkr | 928 | 701 | 0 | 1629 | 0.0 | 0.0 | 2 | 1 | 3 | 1 | TA | 6 | Typ | 1 | TA | Attchd | 1997.0 | Fin | 2.0 | 482.0 | TA | TA | Y | 212 | 34 | 0 | 0 | 0 | 0 | Missing | MnPrv | Missing | 0 | 3 | 2010 | WD | Normal |
| 3 | 1464 | 60 | RL | 78.0 | 9978 | Pave | Missing | IR1 | Lvl | AllPub | Inside | Gtl | Gilbert | Norm | Norm | 1Fam | 2Story | 6 | 6 | 1998 | 1998 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 20.0 | TA | TA | PConc | TA | TA | No | GLQ | 602.0 | Unf | 0.0 | 324.0 | 926.0 | GasA | Ex | Y | SBrkr | 926 | 678 | 0 | 1604 | 0.0 | 0.0 | 2 | 1 | 3 | 1 | Gd | 7 | Typ | 1 | Gd | Attchd | 1998.0 | Fin | 2.0 | 470.0 | TA | TA | Y | 360 | 36 | 0 | 0 | 0 | 0 | Missing | Missing | Missing | 0 | 6 | 2010 | WD | Normal |
| 4 | 1465 | 120 | RL | 43.0 | 5005 | Pave | Missing | IR1 | HLS | AllPub | Inside | Gtl | StoneBr | Norm | Norm | TwnhsE | 1Story | 8 | 5 | 1992 | 1992 | Gable | CompShg | HdBoard | HdBoard | None | 0.0 | Gd | TA | PConc | Gd | TA | No | ALQ | 263.0 | Unf | 0.0 | 1017.0 | 1280.0 | GasA | Ex | Y | SBrkr | 1280 | 0 | 0 | 1280 | 0.0 | 0.0 | 2 | 0 | 2 | 1 | Gd | 5 | Typ | 0 | Missing | Attchd | 1992.0 | RFn | 2.0 | 506.0 | TA | TA | Y | 0 | 82 | 0 | 0 | 144 | 0 | Missing | Missing | Missing | 0 | 1 | 2010 | WD | Normal |
temp_df
SaleCondition Abnorml 0.082192 Alloca 0.017123 Normal 0.801370 Partial 0.092466 Name: SalePrice, dtype: float64
for i,feature in enumerate(cat_var):
test_df[feature]=test_df[feature].map(all_labels[i])
num_nan_features = [feature for feature in test_df.columns if test_df[feature].dtypes!='O'
and test_df[feature].isnull().sum()>0]
for feature in num_nan_features:
median=test_df[feature].median()
test_df[feature].fillna(median,inplace=True)
for feature in ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']:
test_df[feature] = test_df['YrSold']-test_df[feature]
test_df.head()
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | Condition2 | BldgType | HouseStyle | OverallQual | OverallCond | YearBuilt | YearRemodAdd | RoofStyle | RoofMatl | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | ExterQual | ExterCond | Foundation | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinSF1 | BsmtFinType2 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | Heating | HeatingQC | CentralAir | Electrical | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | KitchenQual | TotRmsAbvGrd | Functional | Fireplaces | FireplaceQu | GarageType | GarageYrBlt | GarageFinish | GarageCars | GarageArea | GarageQual | GarageCond | PavedDrive | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1461 | 20 | 2.0 | 80.0 | 11622 | 1.0 | 2 | 0.0 | 1 | 1.0 | 1.0 | 0.0 | 7.0 | 1.0 | 1.0 | 3 | 3.0 | 5 | 6 | 49 | 49 | 0.0 | 0.0 | 9.0 | 8.0 | 1.0 | 0.0 | 1.0 | 3.0 | 2.0 | 2 | 3.0 | 1 | 1 | 468.0 | 3.0 | 144.0 | 270.0 | 882.0 | 2.0 | 2.0 | 1 | 3.0 | 896 | 0 | 0 | 896 | 0.0 | 0.0 | 1 | 0 | 2 | 1 | 1.0 | 5 | 4.0 | 0 | 1 | 4.0 | 49.0 | 1 | 1.0 | 730.0 | 3.0 | 3.0 | 2 | 140 | 0 | 0 | 0 | 120 | 0 | 0.0 | 2.0 | 2.0 | 0 | 6 | 2010 | 2.0 | 3.0 |
| 1 | 1462 | 20 | 3.0 | 81.0 | 14267 | 1.0 | 2 | 1.0 | 1 | 1.0 | 0.0 | 0.0 | 7.0 | 2.0 | 1.0 | 3 | 3.0 | 6 | 6 | 52 | 52 | 2.0 | 0.0 | 3.0 | 1.0 | 2.0 | 108.0 | 1.0 | 3.0 | 2.0 | 2 | 3.0 | 1 | 4 | 923.0 | 4.0 | 0.0 | 406.0 | 1329.0 | 2.0 | 2.0 | 1 | 3.0 | 1329 | 0 | 0 | 1329 | 0.0 | 0.0 | 1 | 1 | 3 | 1 | 2.0 | 6 | 4.0 | 0 | 1 | 4.0 | 52.0 | 1 | 1.0 | 312.0 | 3.0 | 3.0 | 2 | 393 | 36 | 0 | 0 | 0 | 0 | 0.0 | 3.0 | 2.0 | 12500 | 6 | 2010 | 2.0 | 3.0 |
| 2 | 1463 | 60 | 3.0 | 74.0 | 13830 | 1.0 | 2 | 1.0 | 1 | 1.0 | 1.0 | 0.0 | 13.0 | 2.0 | 1.0 | 3 | 6.0 | 5 | 5 | 13 | 12 | 0.0 | 0.0 | 9.0 | 8.0 | 1.0 | 0.0 | 1.0 | 3.0 | 4.0 | 3 | 3.0 | 1 | 6 | 791.0 | 4.0 | 0.0 | 137.0 | 928.0 | 2.0 | 3.0 | 1 | 3.0 | 928 | 701 | 0 | 1629 | 0.0 | 0.0 | 2 | 1 | 3 | 1 | 1.0 | 6 | 4.0 | 1 | 3 | 4.0 | 13.0 | 3 | 2.0 | 482.0 | 3.0 | 3.0 | 2 | 212 | 34 | 0 | 0 | 0 | 0 | 0.0 | 2.0 | 2.0 | 0 | 3 | 2010 | 2.0 | 3.0 |
| 3 | 1464 | 60 | 3.0 | 78.0 | 9978 | 1.0 | 2 | 1.0 | 1 | 1.0 | 1.0 | 0.0 | 13.0 | 2.0 | 1.0 | 3 | 6.0 | 6 | 6 | 12 | 12 | 0.0 | 0.0 | 9.0 | 8.0 | 2.0 | 20.0 | 1.0 | 3.0 | 4.0 | 2 | 3.0 | 1 | 6 | 602.0 | 4.0 | 0.0 | 324.0 | 926.0 | 2.0 | 4.0 | 1 | 3.0 | 926 | 678 | 0 | 1604 | 0.0 | 0.0 | 2 | 1 | 3 | 1 | 2.0 | 7 | 4.0 | 1 | 4 | 4.0 | 12.0 | 3 | 2.0 | 470.0 | 3.0 | 3.0 | 2 | 360 | 36 | 0 | 0 | 0 | 0 | 0.0 | 3.0 | 2.0 | 0 | 6 | 2010 | 2.0 | 3.0 |
| 4 | 1465 | 120 | 3.0 | 43.0 | 5005 | 1.0 | 2 | 1.0 | 3 | 1.0 | 1.0 | 0.0 | 19.0 | 2.0 | 1.0 | 4 | 3.0 | 8 | 5 | 18 | 18 | 0.0 | 0.0 | 5.0 | 6.0 | 1.0 | 0.0 | 2.0 | 3.0 | 4.0 | 3 | 3.0 | 1 | 4 | 263.0 | 4.0 | 0.0 | 1017.0 | 1280.0 | 2.0 | 4.0 | 1 | 3.0 | 1280 | 0 | 0 | 1280 | 0.0 | 0.0 | 2 | 0 | 2 | 1 | 2.0 | 5 | 4.0 | 0 | 1 | 4.0 | 18.0 | 2 | 2.0 | 506.0 | 3.0 | 3.0 | 2 | 0 | 82 | 0 | 0 | 144 | 0 | 0.0 | 3.0 | 2.0 | 0 | 1 | 2010 | 2.0 | 3.0 |
test_final=test_df[selected_feat]
l=Lasso(alpha=1e-15)
l.fit(train_x,train_y)
h=l.predict(test_final)
h
array([1063.41780238, 1361.16129202, 1379.74766253, ..., 1714.04332051,
964.19046163, 1220.71493951])
test_df['SalePrice']=h
test_df.head()
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | Condition2 | BldgType | HouseStyle | OverallQual | OverallCond | YearBuilt | YearRemodAdd | RoofStyle | RoofMatl | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | ExterQual | ExterCond | Foundation | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinSF1 | BsmtFinType2 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | Heating | HeatingQC | CentralAir | Electrical | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | KitchenQual | TotRmsAbvGrd | Functional | Fireplaces | FireplaceQu | GarageType | GarageYrBlt | GarageFinish | GarageCars | GarageArea | GarageQual | GarageCond | PavedDrive | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1461 | 20 | 2.0 | 80.0 | 11622 | 1.0 | 2 | 0.0 | 1 | 1.0 | 1.0 | 0.0 | 7.0 | 1.0 | 1.0 | 3 | 3.0 | 5 | 6 | 49 | 49 | 0.0 | 0.0 | 9.0 | 8.0 | 1.0 | 0.0 | 1.0 | 3.0 | 2.0 | 2 | 3.0 | 1 | 1 | 468.0 | 3.0 | 144.0 | 270.0 | 882.0 | 2.0 | 2.0 | 1 | 3.0 | 896 | 0 | 0 | 896 | 0.0 | 0.0 | 1 | 0 | 2 | 1 | 1.0 | 5 | 4.0 | 0 | 1 | 4.0 | 49.0 | 1 | 1.0 | 730.0 | 3.0 | 3.0 | 2 | 140 | 0 | 0 | 0 | 120 | 0 | 0.0 | 2.0 | 2.0 | 0 | 6 | 2010 | 2.0 | 3.0 | 1063.417802 |
| 1 | 1462 | 20 | 3.0 | 81.0 | 14267 | 1.0 | 2 | 1.0 | 1 | 1.0 | 0.0 | 0.0 | 7.0 | 2.0 | 1.0 | 3 | 3.0 | 6 | 6 | 52 | 52 | 2.0 | 0.0 | 3.0 | 1.0 | 2.0 | 108.0 | 1.0 | 3.0 | 2.0 | 2 | 3.0 | 1 | 4 | 923.0 | 4.0 | 0.0 | 406.0 | 1329.0 | 2.0 | 2.0 | 1 | 3.0 | 1329 | 0 | 0 | 1329 | 0.0 | 0.0 | 1 | 1 | 3 | 1 | 2.0 | 6 | 4.0 | 0 | 1 | 4.0 | 52.0 | 1 | 1.0 | 312.0 | 3.0 | 3.0 | 2 | 393 | 36 | 0 | 0 | 0 | 0 | 0.0 | 3.0 | 2.0 | 12500 | 6 | 2010 | 2.0 | 3.0 | 1361.161292 |
| 2 | 1463 | 60 | 3.0 | 74.0 | 13830 | 1.0 | 2 | 1.0 | 1 | 1.0 | 1.0 | 0.0 | 13.0 | 2.0 | 1.0 | 3 | 6.0 | 5 | 5 | 13 | 12 | 0.0 | 0.0 | 9.0 | 8.0 | 1.0 | 0.0 | 1.0 | 3.0 | 4.0 | 3 | 3.0 | 1 | 6 | 791.0 | 4.0 | 0.0 | 137.0 | 928.0 | 2.0 | 3.0 | 1 | 3.0 | 928 | 701 | 0 | 1629 | 0.0 | 0.0 | 2 | 1 | 3 | 1 | 1.0 | 6 | 4.0 | 1 | 3 | 4.0 | 13.0 | 3 | 2.0 | 482.0 | 3.0 | 3.0 | 2 | 212 | 34 | 0 | 0 | 0 | 0 | 0.0 | 2.0 | 2.0 | 0 | 3 | 2010 | 2.0 | 3.0 | 1379.747663 |
| 3 | 1464 | 60 | 3.0 | 78.0 | 9978 | 1.0 | 2 | 1.0 | 1 | 1.0 | 1.0 | 0.0 | 13.0 | 2.0 | 1.0 | 3 | 6.0 | 6 | 6 | 12 | 12 | 0.0 | 0.0 | 9.0 | 8.0 | 2.0 | 20.0 | 1.0 | 3.0 | 4.0 | 2 | 3.0 | 1 | 6 | 602.0 | 4.0 | 0.0 | 324.0 | 926.0 | 2.0 | 4.0 | 1 | 3.0 | 926 | 678 | 0 | 1604 | 0.0 | 0.0 | 2 | 1 | 3 | 1 | 2.0 | 7 | 4.0 | 1 | 4 | 4.0 | 12.0 | 3 | 2.0 | 470.0 | 3.0 | 3.0 | 2 | 360 | 36 | 0 | 0 | 0 | 0 | 0.0 | 3.0 | 2.0 | 0 | 6 | 2010 | 2.0 | 3.0 | 1128.018976 |
| 4 | 1465 | 120 | 3.0 | 43.0 | 5005 | 1.0 | 2 | 1.0 | 3 | 1.0 | 1.0 | 0.0 | 19.0 | 2.0 | 1.0 | 4 | 3.0 | 8 | 5 | 18 | 18 | 0.0 | 0.0 | 5.0 | 6.0 | 1.0 | 0.0 | 2.0 | 3.0 | 4.0 | 3 | 3.0 | 1 | 4 | 263.0 | 4.0 | 0.0 | 1017.0 | 1280.0 | 2.0 | 4.0 | 1 | 3.0 | 1280 | 0 | 0 | 1280 | 0.0 | 0.0 | 2 | 0 | 2 | 1 | 2.0 | 5 | 4.0 | 0 | 1 | 4.0 | 18.0 | 2 | 2.0 | 506.0 | 3.0 | 3.0 | 2 | 0 | 82 | 0 | 0 | 144 | 0 | 0.0 | 3.0 | 2.0 | 0 | 1 | 2010 | 2.0 | 3.0 | 756.362138 |
submission_data=test_df[['Id','SalePrice']]
submission_data.to_csv('Suuubbmission.csv',index=False)